# evalscope/collections/evaluator.py
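"""Evaluation of dataset collections.

Runs a base model over a sampled mixture of benchmarks (a "collection"),
reviews and scores the answers per sample, and aggregates the results into
multi-level reports.
"""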


import json
import os
import pandas as pd
import random
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from copy import deepcopy
from tabulate import tabulate
from tqdm import tqdm
from typing import Any, Dict, List, Tuple
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.collections.sampler import DatasetEntry
from evalscope.config import TaskConfig
from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
from evalscope.evaluator import Evaluator
from evalscope.models import initialize_model_adapter
from evalscope.report import ReportGenerator
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger
logger = get_logger()
class SimpleEvaluator(Evaluator):
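    """A thin wrapper around `Evaluator` that exposes batch answer generation and
    per-sample review/score helpers for use by `EvaluatorCollection`.
    """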
def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
super().__init__(
dataset_name_or_path=dataset_name,
data_adapter=data_adapter,
model_adapter=model_adapter,
task_cfg=task_cfg,
outputs=outputs)
    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> Tuple[List[dict], List[DatasetEntry]]:
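        """Run inference on a batch of samples and return (processed answers, samples)."""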
input_prompts = [sample.prompt for sample in samples]
subset_name = samples[0].subset_name
try:
# get answer from model
answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
except Exception as e:
            logger.error(f'Failed to get answers for {input_prompts}, due to: {e}')
            # if ignore_errors is True, drop this batch and continue with the next one
            if self.task_cfg.ignore_errors:
                logger.warning('`ignore_errors` is set to True. Dropping this batch and continuing with evaluation.')
return [None] * len(samples), samples
else:
raise e
# process answers
answers_list = []
for answer_d, input_prompt in zip(answer_ds, input_prompts):
answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
answers_list.append(processed_answer)
return answers_list, samples
def get_review(self, answer_d) -> dict:
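        """Generate a review for a single answer dict."""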
review_id, reviewer_spec = self._generate_review_id(answer_d)
review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
return review_d
    def get_score(self, review_d) -> List[dict]:
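        """Compute metric scores for a single review; returns a list of dicts with 'metric_name' and 'score'."""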
metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
return metric_score
class EvaluatorCollection:
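    """Coordinates evaluation over a mixed collection of benchmark samples:
    loads the collection, routes each sample to a per-dataset `SimpleEvaluator`,
    and aggregates the resulting scores into multi-level reports
    (subset / dataset / task / tag / category).
    """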
def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure, base_model):
self.task_cfg = task_cfg
self.data_adapter = data_adapter
self.outputs = outputs
self.model = base_model
self.dataset, self.dataset_name = self.load()
self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
self.evaluators = self._initialize_evaluators()
    def load(self) -> Tuple[List[DatasetEntry], str]:
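        """Load the collection, optionally subsample it according to `task_cfg.limit`,
        and wrap each record as a `DatasetEntry`. Returns the entries and the
        collection name derived from the dataset id.
        """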
dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
raw_dataset = self.data_adapter.load()
        # randomly sample the dataset down to the configured limit (int: absolute count, float: fraction)
limit = len(raw_dataset)
if self.task_cfg.limit is not None:
if isinstance(self.task_cfg.limit, int):
limit = self.task_cfg.limit
elif isinstance(self.task_cfg.limit, float):
limit = int(len(raw_dataset) * self.task_cfg.limit)
raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
        # attach each sample's collection index to its prompt and wrap it as a DatasetEntry
datasets = []
for sample in raw_dataset:
sample['prompt'].update({'index': sample['index']})
datasets.append(DatasetEntry(**sample))
return datasets, dataset_name
@staticmethod
def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
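        """Group sample indices by dataset name and subset name."""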
dataset_name_map = defaultdict(lambda: defaultdict(list))
for sample in dataset:
dataset_name, subset_name = sample.dataset_name, sample.subset_name
dataset_name_map[dataset_name][subset_name].append(sample.index)
return dataset_name_map
@staticmethod
def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
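        """Map each sample's index to its `DatasetEntry`."""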
dataset_id_map = {}
for sample in dataset:
dataset_id_map[sample.index] = sample
return dataset_id_map
def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
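        """Create one `SimpleEvaluator` per dataset in the collection, each with its own
        data adapter and model adapter built from the benchmark registry."""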
evaluators = {}
# load dataset args
dataset_args = deepcopy(self.task_cfg.dataset_args)
common_args = dataset_args.get(DataCollection.NAME, {})
for dataset_name in self.dataset_name_map.keys():
benchmark = Benchmark.get(dataset_name)
model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
            # merge dataset-specific args with the collection-level common args (common args take precedence)
            cur_dataset_args = dataset_args.get(dataset_name, {})
            cur_dataset_args.update(common_args)
# get data adapter
data_adapter = benchmark.get_data_adapter(cur_dataset_args)
evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
self.outputs)
return evaluators
def get_report(self, scores):
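        """Aggregate per-sample scores into subset/dataset/task/tag/category level tables,
        log them, then build, optionally analyse, and save the collection report."""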
if not scores:
return
def get_dataframe(scores):
data = []
for dataset_name, data_map in self.dataset_name_map.items():
for subset_name, ids in data_map.items():
for _id in ids:
row_data: DatasetEntry = self.dataset_id_map[_id]
for metric in scores[_id]:
data.append(
dict(
task_type=row_data.task_type,
categories=tuple(row_data.categories),
dataset_name=dataset_name,
subset_name=subset_name,
tags=row_data.tags,
metric=metric['metric_name'],
score=metric['score']))
return pd.DataFrame(data)
def aggregate_and_sort(df, group_by_cols):
# aggregate by group_by_cols, and calculate average_score and count
report_df = df.groupby(group_by_cols) \
.agg(average_score=('score', 'mean'), count=('score', 'size')) \
.reset_index()
report_df['average_score'] = report_df['average_score'].round(4)
report_df = report_df.sort_values(by='count', ascending=False) \
.to_dict(orient='records')
return report_df
df = get_dataframe(scores)
# multi-level aggregation
subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
# explode tags to multiple rows
df_exploded_tags = df.explode('tags')
tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
# process multi-level categories
df_categories = df.copy()
# multi-level aggregation for categories
max_depth = df_categories['categories'].apply(len).max()
for level in range(max_depth):
            df_categories[f'category{level}'] = df_categories['categories'].apply(
                lambda x: x[level] if len(x) > level else '')
category_report_df = aggregate_and_sort(df_categories,
[f'category{level}' for level in range(max_depth)] + ['metric'])
# convert to dict format
report_dict = {
'subset_level': subset_report_df,
'dataset_level': dataset_report_df,
'task_level': task_report_df,
'tag_level': tag_report_df,
'category_level': category_report_df,
}
# record report
for level, data in report_dict.items():
table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
logger.info(f'{level} Report:\n{table}')
report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
# Make report analysis
if self.task_cfg.analysis_report:
logger.info('Generating report analysis, please wait ...')
analysis = report.generate_analysis(self.task_cfg.judge_model_args)
logger.info('Report analysis:\n%s', analysis)
else:
logger.info('Skipping report analysis (`analysis_report=False`).')
# save report to JSON file
report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
report.to_json(report_file_path)
logger.info(f'Report saved to {report_file_path}')
return report
def _filter_answer(self, pred_file_path):
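        """When `use_cache` is enabled and a prediction file exists, load cached answers and return
        (cached answers, remaining samples, name map of remaining samples); otherwise return the full dataset."""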
answer_dict = defaultdict(dict)
if self.task_cfg.use_cache and os.path.exists(pred_file_path):
answers_list = jsonl_to_list(pred_file_path)
# Create a set of sample indices for which we have answers
indices = set()
for answer in answers_list:
index = answer.get(AnswerKeys.INDEX)
answer_dict[index] = answer
indices.add(index)
# Filter dataset to only include samples that don't have answers
data = [sample for sample in self.dataset if sample.index not in indices]
# Initialize name map for the filtered dataset
data_map = self._init_name_map(data)
            logger.info(f'Reusing cached answers from {pred_file_path}: loaded {len(indices)} cached samples, '
                        f'{len(data)} samples remaining.')
return answer_dict, data, data_map
else:
# If cache isn't enabled or file doesn't exist, return the full dataset
return answer_dict, self.dataset, self.dataset_name_map
def get_answers(self):
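        """Generate answers for all samples not already cached, using a thread pool (one sample per task)
        for service-type models or per-subset batches otherwise; each answer is appended to the
        predictions JSONL file as it completes."""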
pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
f'{self.dataset_name}.jsonl')
os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
eval_batch_size = self.task_cfg.eval_batch_size
# Process samples and get answers
with tqdm(total=len(dataset), desc='Getting answers') as pbar:
if self.task_cfg.eval_type == EvalType.SERVICE:
# Create a thread pool for parallel processing
with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
futures = []
for sample in dataset:
evaluator = self.evaluators[sample.dataset_name]
futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
# Process completed tasks
for future in as_completed(futures):
answer_list, samples = future.result()
for answer_d, sample in zip(answer_list, samples):
if answer_d is None:
continue
answers[sample.index] = answer_d
dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
pbar.update(1)
else:
for dataset_name, data_map in dataset_name_map.items():
# get evaluator for the dataset
evaluator = self.evaluators[dataset_name]
for subset_name, ids in data_map.items():
for i in range(0, len(ids), eval_batch_size):
# get batch samples
batch_ids = ids[i:i + eval_batch_size]
batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
# update answers
for answer_d, sample in zip(answer_list, samples):
if answer_d is None:
continue
answers[sample.index] = answer_d
dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
pbar.update(1)
return answers
def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
"""
Retrieve or generate reviews for given answers.
Args:
answers: Dictionary of answers indexed by sample index.
Returns:
Dictionary of reviews indexed by sample index.
"""
# Set up the review file path
review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
os.makedirs(review_file_path, exist_ok=True)
review_history_map = defaultdict(dict)
# Handle caching logic
if os.path.exists(review_file_path):
if not self.task_cfg.use_cache:
# Clear existing reviews if not using cache
self._clear_review_files(review_file_path)
else:
# Load existing reviews if using cache
self._load_existing_reviews(review_file_path, review_history_map)
reviews = {}
for sample in tqdm(self.dataset, desc='Getting reviews'):
try:
file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
# Use cached review if available
review_d = review_history_map[file_name][sample.index]
else:
# Generate new review
evaluator = self.evaluators[sample.dataset_name]
review_d = evaluator.get_review(answers[sample.index])
# Only save the review if it's not in the cache
self._save_review(review_file_path, file_name, review_d)
reviews[sample.index] = review_d
except Exception as e:
logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')
return reviews
def _clear_review_files(self, review_file_path: str) -> None:
"""Clear existing review files."""
if os.path.isdir(review_file_path):
for filename in os.listdir(review_file_path):
file_path = os.path.join(review_file_path, filename)
try:
if os.path.isfile(file_path):
os.remove(file_path)
except Exception as e:
logger.error(f'Error deleting file {file_path}: {e}')
else:
os.remove(review_file_path)
def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
"""Load existing reviews from files."""
logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
if os.path.isdir(review_file_path):
for filename in os.listdir(review_file_path):
if '.ipynb_checkpoints' in filename:
continue
file_path = os.path.join(review_file_path, filename)
with open(file_path, 'r') as f:
review_history = [json.loads(line.strip()) for line in f]
review_history_map[filename] = {item['index']: item for item in review_history}
def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
"""Save a single review to file."""
file_path = os.path.join(review_file_path, file_name)
dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
    def get_scores(self, reviews) -> Dict[int, List[dict]]:
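        """Compute metric scores for every reviewed sample, keyed by sample index."""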
scores = defaultdict(dict)
for sample in tqdm(self.dataset, desc='Getting scores'):
evaluator = self.evaluators[sample.dataset_name]
if sample.index not in reviews:
continue
review_d = reviews[sample.index]
score = evaluator.get_score(review_d)
scores[sample.index] = score
return scores
def eval(self, **kwargs):
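        """Run the full pipeline: generate answers, review them, compute scores, and build the report."""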
answers = self.get_answers()
reviews = self.get_reviews(answers)
scores = self.get_scores(reviews)
report = self.get_report(scores)
return report