import json
import os
import random
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from copy import deepcopy
from typing import Any, Dict, List, Tuple

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.collections.sampler import DatasetEntry
from evalscope.config import TaskConfig
from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
from evalscope.evaluator import Evaluator
from evalscope.models import initialize_model_adapter
from evalscope.report import ReportGenerator
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()


class SimpleEvaluator(Evaluator):
    """Thin wrapper around ``Evaluator`` that exposes per-sample answer, review and score steps."""

    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
        super().__init__(
            dataset_name_or_path=dataset_name,
            data_adapter=data_adapter,
            model_adapter=model_adapter,
            task_cfg=task_cfg,
            outputs=outputs)

    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> Tuple[List[dict], List[DatasetEntry]]:
        input_prompts = [sample.prompt for sample in samples]
        subset_name = samples[0].subset_name
        try:
            # Get answers from the model
            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
        except Exception as e:
            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
            # If ignore_errors is True, drop this batch and continue with the next one
            if self.task_cfg.ignore_errors:
                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
                return [None] * len(samples), samples
            else:
                raise e
        # Process answers
        answers_list = []
        for answer_d, input_prompt in zip(answer_ds, input_prompts):
            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
            answers_list.append(processed_answer)
        return answers_list, samples

    def get_review(self, answer_d) -> dict:
        review_id, reviewer_spec = self._generate_review_id(answer_d)
        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
        return review_d

    def get_score(self, review_d) -> List[dict]:
        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
        return metric_score


class EvaluatorCollection:
    """Evaluate a mixed collection of benchmarks with a single model.

    Builds one ``SimpleEvaluator`` per benchmark in the collection and drives the
    answer -> review -> score -> report pipeline across all samples.
    """

    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure, base_model):
        self.task_cfg = task_cfg
        self.data_adapter = data_adapter
        self.outputs = outputs
        self.model = base_model

        self.dataset, self.dataset_name = self.load()
        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
        self.evaluators = self._initialize_evaluators()

    def load(self) -> Tuple[List[DatasetEntry], str]:
        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
        raw_dataset = self.data_adapter.load()
        # Randomly limit the dataset: an int limit is an absolute sample count,
        # a float limit is a fraction of the full dataset
        limit = len(raw_dataset)
        if self.task_cfg.limit is not None:
            if isinstance(self.task_cfg.limit, int):
                limit = self.task_cfg.limit
            elif isinstance(self.task_cfg.limit, float):
                limit = int(len(raw_dataset) * self.task_cfg.limit)
        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
        # Index the dataset: propagate the sample index into the prompt payload
        datasets = []
        for sample in raw_dataset:
            sample['prompt'].update({'index': sample['index']})
            datasets.append(DatasetEntry(**sample))

        return datasets, dataset_name

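    # A minimal sketch of the raw sample shape that ``load`` expects from
    # ``self.data_adapter.load()`` before wrapping it in ``DatasetEntry``. Field names are
    # inferred from attribute access in this file; values are purely illustrative:
    #
    #   {
    #       'index': 42,                    # unique sample id within the collection
    #       'prompt': {...},                # model input; ``load`` injects 'index' into it
    #       'dataset_name': 'gsm8k',        # benchmark the sample was drawn from
    #       'subset_name': 'main',
    #       'task_type': 'math',
    #       'categories': ['reasoning'],    # hierarchical category path, outermost first
    #       'tags': ['en'],
    #   }
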
    @staticmethod
    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
        """Group sample indices by dataset name and subset name."""
        dataset_name_map = defaultdict(lambda: defaultdict(list))
        for sample in dataset:
            dataset_name, subset_name = sample.dataset_name, sample.subset_name
            dataset_name_map[dataset_name][subset_name].append(sample.index)
        return dataset_name_map

    @staticmethod
    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
        """Map each sample index to its ``DatasetEntry``."""
        dataset_id_map = {}
        for sample in dataset:
            dataset_id_map[sample.index] = sample
        return dataset_id_map

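    # Shape of the two lookup tables built above (values illustrative):
    #   dataset_name_map: {'gsm8k': {'main': [0, 3, 7]}, 'arc': {'ARC-Easy': [1, 2]}}
    #   dataset_id_map:   {0: DatasetEntry(...), 1: DatasetEntry(...), ...}
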
    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
        """Create one ``SimpleEvaluator`` (with its own data adapter) per benchmark in the collection."""
        evaluators = {}
        # Load dataset args
        dataset_args = deepcopy(self.task_cfg.dataset_args)
        common_args = dataset_args.get(DataCollection.NAME, {})
        for dataset_name in self.dataset_name_map.keys():
            benchmark = Benchmark.get(dataset_name)
            model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
            # Merge dataset args; collection-level args take precedence over per-dataset args
            cur_dataset_args = dataset_args.get(dataset_name, {})
            cur_dataset_args.update(common_args)
            # Get the data adapter
            data_adapter = benchmark.get_data_adapter(cur_dataset_args)
            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
                                                       self.outputs)
        return evaluators

    def get_report(self, scores: Dict[int, List[dict]]):
        """Aggregate per-sample scores into subset/dataset/task/tag/category level reports and persist the result."""
        if not scores:
            return

        def get_dataframe(scores):
            data = []
            for dataset_name, data_map in self.dataset_name_map.items():
                for subset_name, ids in data_map.items():
                    for _id in ids:
                        row_data: DatasetEntry = self.dataset_id_map[_id]
                        for metric in scores[_id]:
                            data.append(
                                dict(
                                    task_type=row_data.task_type,
                                    categories=tuple(row_data.categories),
                                    dataset_name=dataset_name,
                                    subset_name=subset_name,
                                    tags=row_data.tags,
                                    metric=metric['metric_name'],
                                    score=metric['score']))
            return pd.DataFrame(data)

        def aggregate_and_sort(df, group_by_cols):
            # Aggregate by group_by_cols and compute the average score and sample count
            report_df = df.groupby(group_by_cols) \
                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
                .reset_index()
            report_df['average_score'] = report_df['average_score'].round(4)
            report_df = report_df.sort_values(by='count', ascending=False) \
                .to_dict(orient='records')
            return report_df

        df = get_dataframe(scores)

        # Multi-level aggregation
        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])

        # Explode tags into multiple rows
        df_exploded_tags = df.explode('tags')
        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])

        # Process multi-level categories: pad each category path to the maximum depth
        df_categories = df.copy()
        max_depth = df_categories['categories'].apply(len).max()
        for level in range(max_depth):
            df_categories[f'category{level}'] = df_categories['categories'].apply(
                lambda x: x[level] if len(x) > level else '')
        category_report_df = aggregate_and_sort(df_categories,
                                                [f'category{level}' for level in range(max_depth)] + ['metric'])

        # Collect all aggregation levels
        report_dict = {
            'subset_level': subset_report_df,
            'dataset_level': dataset_report_df,
            'task_level': task_report_df,
            'tag_level': tag_report_df,
            'category_level': category_report_df,
        }

        # Log a table for each aggregation level
        for level, data in report_dict.items():
            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
            logger.info(f'{level} Report:\n{table}')

        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
        # Generate the report analysis if requested
        if self.task_cfg.analysis_report:
            logger.info('Generating report analysis, please wait ...')
            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
            logger.info('Report analysis:\n%s', analysis)
        else:
            logger.info('Skipping report analysis (`analysis_report=False`).')

        # Save the report to a JSON file
        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
        report.to_json(report_file_path)

        logger.info(f'Report saved to {report_file_path}')
        return report

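    # Illustrative sketch (hypothetical values) of what the inner ``aggregate_and_sort`` helper in
    # ``get_report`` produces for a toy frame holding two scored samples of the same subset:
    #
    #   df = pd.DataFrame([
    #       dict(task_type='math', metric='AverageAccuracy', dataset_name='gsm8k', subset_name='main', score=1.0),
    #       dict(task_type='math', metric='AverageAccuracy', dataset_name='gsm8k', subset_name='main', score=0.0),
    #   ])
    #   aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
    #   # -> [{'task_type': 'math', 'metric': 'AverageAccuracy', 'dataset_name': 'gsm8k',
    #   #      'subset_name': 'main', 'average_score': 0.5, 'count': 2}]
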
    def _filter_answer(self, pred_file_path):
        """Split the dataset into cached answers and samples that still need inference."""
        answer_dict = defaultdict(dict)
        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
            answers_list = jsonl_to_list(pred_file_path)
            # Collect the sample indices for which we already have answers
            indices = set()
            for answer in answers_list:
                index = answer.get(AnswerKeys.INDEX)
                answer_dict[index] = answer
                indices.add(index)

            # Keep only the samples that do not have answers yet
            data = [sample for sample in self.dataset if sample.index not in indices]

            # Rebuild the name map for the filtered dataset
            data_map = self._init_name_map(data)

            logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
            return answer_dict, data, data_map
        else:
            # If the cache is not enabled or the file does not exist, return the full dataset
            return answer_dict, self.dataset, self.dataset_name_map

    def get_answers(self):
        """Run inference for all pending samples and append the answers to the prediction JSONL file."""
        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                      f'{self.dataset_name}.jsonl')
        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)

        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

        eval_batch_size = self.task_cfg.eval_batch_size
        # Process samples and get answers
        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
            if self.task_cfg.eval_type == EvalType.SERVICE:
                # Service backend: one request per sample, issued in parallel from a thread pool
                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                    futures = []
                    for sample in dataset:
                        evaluator = self.evaluators[sample.dataset_name]
                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
                    # Process completed tasks
                    for future in as_completed(futures):
                        answer_list, samples = future.result()
                        for answer_d, sample in zip(answer_list, samples):
                            if answer_d is None:
                                continue
                            answers[sample.index] = answer_d
                            dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
                        pbar.update(1)
            else:
                # Local backend: iterate per dataset/subset and predict in batches
                for dataset_name, data_map in dataset_name_map.items():
                    # Get the evaluator for this dataset
                    evaluator = self.evaluators[dataset_name]
                    for subset_name, ids in data_map.items():
                        for i in range(0, len(ids), eval_batch_size):
                            # Get a batch of samples
                            batch_ids = ids[i:i + eval_batch_size]
                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
                            answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                            # Update answers
                            for answer_d, sample in zip(answer_list, samples):
                                if answer_d is None:
                                    continue
                                answers[sample.index] = answer_d
                                dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
                            # The progress bar counts samples, so advance it by the batch size
                            pbar.update(len(batch_ids))
        return answers

    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
        """
        Retrieve or generate reviews for given answers.

        Args:
            answers: Dictionary of answers indexed by sample index.

        Returns:
            Dictionary of reviews indexed by sample index.
        """
        # Set up the review file path
        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
        os.makedirs(review_file_path, exist_ok=True)

        review_history_map = defaultdict(dict)

        # Handle caching logic
        if os.path.exists(review_file_path):
            if not self.task_cfg.use_cache:
                # Clear existing reviews if not using cache
                self._clear_review_files(review_file_path)
            else:
                # Load existing reviews if using cache
                self._load_existing_reviews(review_file_path, review_history_map)

        reviews = {}
        for sample in tqdm(self.dataset, desc='Getting reviews'):
            try:
                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'

                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
                    # Use cached review if available
                    review_d = review_history_map[file_name][sample.index]
                else:
                    # Generate new review
                    evaluator = self.evaluators[sample.dataset_name]
                    review_d = evaluator.get_review(answers[sample.index])
                    # Only save the review if it's not in the cache
                    self._save_review(review_file_path, file_name, review_d)

                reviews[sample.index] = review_d
            except Exception as e:
                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')

        return reviews

    def _clear_review_files(self, review_file_path: str) -> None:
        """Clear existing review files."""
        if os.path.isdir(review_file_path):
            for filename in os.listdir(review_file_path):
                file_path = os.path.join(review_file_path, filename)
                try:
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                except Exception as e:
                    logger.error(f'Error deleting file {file_path}: {e}')
        else:
            os.remove(review_file_path)

    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
        """Load existing reviews from files."""
        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
        if os.path.isdir(review_file_path):
            for filename in os.listdir(review_file_path):
                if '.ipynb_checkpoints' in filename:
                    continue
                file_path = os.path.join(review_file_path, filename)
                with open(file_path, 'r') as f:
                    review_history = [json.loads(line.strip()) for line in f]
                review_history_map[filename] = {item['index']: item for item in review_history}

    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
        """Save a single review to file."""
        file_path = os.path.join(review_file_path, file_name)
        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)

    def get_scores(self, reviews) -> Dict[int, List[dict]]:
        """Compute metric scores for every reviewed sample, keyed by sample index."""
        scores = defaultdict(dict)
        for sample in tqdm(self.dataset, desc='Getting scores'):
            evaluator = self.evaluators[sample.dataset_name]
            if sample.index not in reviews:
                continue
            review_d = reviews[sample.index]
            score = evaluator.get_score(review_d)
            scores[sample.index] = score

        return scores

    def eval(self, **kwargs):
        """Run the full pipeline: answers -> reviews -> scores -> aggregated report."""
        answers = self.get_answers()
        reviews = self.get_reviews(answers)
        scores = self.get_scores(reviews)
        report = self.get_report(scores)
        return report
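

# A minimal usage sketch (illustrative only; the surrounding evalscope task runner normally wires
# these objects up). Assuming a ``TaskConfig``, a collection ``DataAdapter``, an ``OutputsStructure``
# and a base model have already been constructed:
#
#   collection = EvaluatorCollection(task_cfg=task_cfg,
#                                    data_adapter=data_adapter,
#                                    outputs=outputs,
#                                    base_model=base_model)
#   report = collection.eval()  # answers -> reviews -> scores -> aggregated report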