# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
import random
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union
from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
from evalscope.metrics import LLMJudge, metric_registry
from evalscope.report import Report, ReportGenerator
from evalscope.utils.logger import get_logger
logger = get_logger()
class DataAdapter(ABC):
"""
    Data Adapter for the benchmark. Subclasses need to implement the following methods
    (a minimal sketch is given after the list):
- gen_prompt
- get_gold_answer
- parse_pred_result
- match
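    Example: a minimal sketch of a hypothetical subclass. The field names
    ('question', 'answer') and the exact-match scoring below are illustrative
    assumptions, not requirements of this base class.

        class MyBenchmarkAdapter(DataAdapter):

            def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
                # Build the full prompt for one sample; gen_prompt_data unifies the output format.
                prompt = f"Question: {input_d['question']}"
                return self.gen_prompt_data(prompt)

            def get_gold_answer(self, input_d: dict) -> str:
                # 'answer' is an assumed field of the raw sample.
                return input_d['answer']

            def parse_pred_result(self, result: str, raw_input_d: dict = None,
                                  eval_type: str = EvalType.CHECKPOINT) -> str:
                # Strip whitespace from the model output before matching.
                return result.strip()

            def match(self, gold: str, pred: str) -> float:
                # Simple exact-match scoring.
                return float(gold.strip() == pred.strip())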
"""
def __init__(self,
name: str,
dataset_id: str,
model_adapter: str,
subset_list: list,
metric_list: List[str],
llm_as_a_judge: bool = False,
output_types: Optional[List[str]] = None,
few_shot_num: Optional[int] = 0,
train_split: Optional[str] = None,
eval_split: Optional[str] = None,
prompt_template: Optional[str] = None,
system_prompt: Optional[str] = None,
query_template: Optional[str] = None,
pretty_name: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[List[str]] = None,
**kwargs):
"""
Args:
name: str, the name of the benchmark.
dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
model_adapter: str, the model adapter to use for the benchmark.
subset_list: list of subset names for the dataset.
metric_list: list, the metric list to evaluate the model on specific benchmark.
llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
output_types: list, the output types of the model adapter. Default: [model_adapter]
few_shot_num: int, number of few-shot examples. Default: 0
train_split: str, usually for few-shot examples. e.g. 'train'
eval_split: str, the target eval split name. e.g. 'test'
prompt_template: str, the prompt template for the benchmark,
e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
the form of A or B or C or D, do not output explanation:`
system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
description: str, the description of the benchmark,
                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
            tags: list, optional tags for categorizing the benchmark, e.g. ['MCQ', 'Reasoning'].
            **kwargs: extra benchmark-specific options kept in self.config_kwargs, e.g. 'category_map', 'choices', 'few_shot_random'.
        """ # noqa: E501
self.name = name
self.dataset_id = dataset_id
self.model_adapter = model_adapter
self.subset_list = subset_list
self.metric_list = metric_list
self.llm_as_a_judge = llm_as_a_judge
self.output_types = output_types or [model_adapter]
self.few_shot_num = few_shot_num
self.train_split = train_split
self.eval_split = eval_split
self.prompt_template = prompt_template
self.system_prompt = system_prompt
self.query_template = query_template
self.pretty_name = pretty_name
self.description = description
self.tags = tags or []
self.config_kwargs = kwargs
self.category_map = kwargs.get('category_map', {})
self.choices = kwargs.get('choices', None)
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
# find and decorate parse_pred_result method
if hasattr(cls, 'parse_pred_result'):
original_method = cls.parse_pred_result
cls.parse_pred_result = preprocess_decorator(original_method)
def load(self,
dataset_name_or_path: str = None,
subset_list: list = None,
work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
**kwargs) -> dict:
"""
Load the dataset. Remote and local datasets are supported.
You can rewrite this method to support your own local dataset, just follow the format of the output.
Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
            train_dataset, test_dataset: iterable dataset objects, where each item is a dict.
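        Example of the expected return structure (the subset name and sample fields are illustrative):
            {
                'default': {
                    'train': [{'question': '...', 'answer': '...'}, ...],  # few-shot pool
                    'test': [{'question': '...', 'answer': '...'}, ...],   # samples to evaluate
                }
            }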
"""
dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
subset_list = subset_list or self.subset_list
# Try to load dataset from local disk
if os.path.exists(dataset_name_or_path):
logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
trust_remote_code = kwargs.pop('trust_remote_code', False)
data_dict = self.load_from_disk(
dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
else:
logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
trust_remote_code = kwargs.pop('trust_remote_code', True)
data_dict = self.load_from_hub(
dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
if len(data_dict) == 0:
raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
return data_dict
def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
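        """
        Load the requested subsets and splits from a remote hub via MsDataset.
        Args:
            dataset_name_or_path: the dataset id on the hub.
            subset_list: list of subset names to load.
            work_dir: local cache directory for the downloaded dataset.
            **kwargs: extra loading options. 'datasets_hub' selects the hub (ModelScope by default);
                'split_as_subset' loads each subset name as a split name, for datasets whose
                subsets are published as splits.
        Returns: {'subset_name': {split: dataset}} for each configured train/eval split.
        """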
from modelscope.msdatasets import MsDataset
datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
split_as_subset: bool = kwargs.pop('split_as_subset', False)
# Load dataset from remote
logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
data_dict = {}
split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
if len(split_list) == 0:
logger.error(f'Got empty split list: {split_list}')
if split_as_subset:
for sub_name in subset_list:
data_dict[sub_name] = {}
# e.g. train: few-shot, test: target dataset to evaluate
for split in split_list:
dataset = MsDataset.load(
dataset_name=dataset_name_or_path,
split=sub_name, # load subset from split
cache_dir=work_dir,
hub=datasets_hub,
**kwargs)
data_dict[sub_name].update({split: dataset})
else:
for sub_name in subset_list:
data_dict[sub_name] = {}
# e.g. train: few-shot, test: target dataset to evaluate
for split in split_list:
dataset = MsDataset.load(
dataset_name=dataset_name_or_path,
subset_name=sub_name,
split=split,
cache_dir=work_dir,
hub=datasets_hub,
**kwargs)
data_dict[sub_name].update({split: dataset})
return data_dict
def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
"""
Load the dataset from local disk.
        If you want to support a custom local dataset, override this method in your xxx_data_adapter.
        By default it falls back to load_from_hub, which uses modelscope.msdatasets.MsDataset.load on the local path.
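        Example of a possible override (a hedged sketch; it assumes each subset is stored as a
        single '<subset>.jsonl' file, which this base class does not require):

            import json

            def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
                data_dict = {}
                for sub_name in subset_list:
                    file_path = os.path.join(dataset_name_or_path, f'{sub_name}.jsonl')
                    with open(file_path, 'r', encoding='utf-8') as f:
                        samples = [json.loads(line) for line in f if line.strip()]
                    data_dict[sub_name] = {self.eval_split: samples}
                return data_dict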
"""
return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)
def load_with_snapshot(self,
file_structure: Dict[str, List[str]],
dataset_name_or_path: str = None,
subset_list: list = None,
work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
**kwargs) -> dict:
"""
For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
This feature supports both remote and local datasets.
Args:
file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
subset_list: list of subset names for the dataset.
work_dir: str, the working directory to store the dataset.
Returns: {'subset_name': {'eval': eval_dataset}}
""" # noqa: E501
dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
subset_list = subset_list or self.subset_list
# Try to load dataset from local disk
if os.path.exists(dataset_name_or_path):
logger.info(f'Loading dataset from {dataset_name_or_path}')
dataset_path = dataset_name_or_path
else:
from modelscope import dataset_snapshot_download
# Load dataset from remote
logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
# flatten file structure
file_names = [file for sub_files in file_structure.values() for file in sub_files]
# download dataset snapshot
dataset_path = dataset_snapshot_download(
dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
# read and process files
data_dict = defaultdict(dict)
for sub_name in subset_list:
file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
# not train split, only eval split
data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)
return data_dict
def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
"""
        Regroup samples into new subsets: for each sample, the value of `subset_key` (rendered
        through `format`) becomes the new subset name; samples whose new subset name is not in
        self.subset_list are dropped.
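        Example (illustrative field name): with subset_key='category' and format='{}', a sample
        {'category': 'math', ...} loaded under subset 'default' is regrouped into subset 'math',
        provided 'math' appears in self.subset_list.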
"""
res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
for sub_name, sub_data_dict in data_dict.items():
for split in [self.train_split, self.eval_split]:
if split is None:
continue
for sample_d in sub_data_dict[split]:
new_subset_name = format.format(sample_d[subset_key])
if new_subset_name not in self.subset_list:
continue
res_dict[new_subset_name][split].append(sample_d)
return res_dict
def gen_prompts(self, data_dict: dict) -> dict:
"""
Generate dataset prompts from raw input, unify the prompt format for different datasets.
Args:
data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
Returns:
{'subset_name': [prompt_d_1, prompt_d_2, ...]}
prompt_d_i (dict): refer to the output of gen_prompt method.
e.g. train -- few-shot data, test -- target dataset to evaluate.
"""
res_dict: dict = {}
if self.few_shot_num and self.few_shot_num < 0:
            raise ValueError(f'Invalid few_shot_num: {self.few_shot_num} for few-shot evaluation.')
logger.info(f'Use settings: '
f'> few_shot_num: {self.few_shot_num}, '
f'> few_shot_split: {self.train_split}, '
f'> target_eval_split: {self.eval_split}')
for sub_name, sub_data_dict in data_dict.items():
few_shot_data = []
if self.train_split and self.few_shot_num and self.few_shot_num > 0:
few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
self.few_shot_num,
few_shot_random=few_shot_random)
res_dict[sub_name] = []
for sample_d in sub_data_dict[self.eval_split]:
prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
prompt_d[AnswerKeys.RAW_INPUT] = sample_d
res_dict[sub_name].append(prompt_d)
return res_dict
def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
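        """
        Select up to k few-shot examples from data_list: randomly sampled when few_shot_random
        is True, otherwise taken from the head of the list. If k exceeds the list length, all
        examples are returned.
        """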
if k > len(data_list):
k = len(data_list)
if few_shot_random:
return random.sample(data_list, k)
else:
return data_list[:k]
def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
"""
Compute evaluation result by specific metrics.
Args:
            review_res_list: the review results, either a list whose items are the match results
                for gold and pred, or a dict mapping metric name to such a list.
Returns:
Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
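        Example (assuming metric_list is ['AverageAccuracy'], whose registered function averages the scores):
            compute_metric([1, 0, 1, 1]) -> [{'metric_name': 'AverageAccuracy', 'score': 0.75, 'num': 4}]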
"""
if len(self.metric_list) == 0:
raise ValueError('No metric list found for the benchmark.')
res_list = []
for metric_str in self.metric_list:
metric = metric_registry.get(metric_str)
metric_name = metric.name
metric_func = metric.object
if isinstance(review_res_list, dict):
review_res = review_res_list.get(metric_name, [])
else:
review_res = review_res_list
res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
return res_list
def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
**kwargs) -> Dict[str, List[float]]:
"""
        Group per-sample scores by metric name so they can be aggregated per metric.
        Args:
            review_res_list: a list of scores or of score dicts (nested lists are flattened first),
                e.g. [0.5, {'metric_a': 0.3}, ...]
        Returns:
            Dict[str, List[float]]: metric name -> list of collected scores; plain (non-dict) scores
                are collected under 'AverageAccuracy'.
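        Example:
            compute_dict_metric([{'metric_a': 1.0}, {'metric_a': 0.0, 'metric_b': 0.5}])
            -> {'metric_a': [1.0, 0.0], 'metric_b': [0.5]}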
"""
if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
review_res_list = [item for sublist in review_res_list for item in sublist]
items = defaultdict(list)
for scores in review_res_list:
if isinstance(scores, dict):
for k, v in scores.items():
items[k].append(v)
else:
items['AverageAccuracy'].append(scores)
return items
def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
"""
Generate report for the evaluation results for all subsets.
Args:
subset_score_map: The subset-score map.
e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
model_name: The evaluation model name.
Returns: The evaluation report.
Here is a format example for gsm8k:
{
"name": "qwen2.5_gsm8k",
"metrics": [
{
"name": "AverageAccuracy",
"categories": [
{
"name": "default",
"subsets": [
{
"name": "main",
"score": 0.0,
"num": 2
}
],
"num": 2,
"score": 0.0,
"macro_score": 0.0
}
],
"num": 2,
"score": 0.0,
"macro_score": 0.0
}
],
"dataset_name": "gsm8k",
"model_name": "qwen2.5"
}
""" # noqa: E501
return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
def post_process_report(self, report: Report, **kwargs):
"""
Post-process the report after generation. Draw a chart, save to file, etc.
This method can be overridden to customize the report format or content.
Args:
report (Report): The generated report.
"""
pass
def gen_prompt_data(self,
prompt: str,
system_prompt: Optional[str] = None,
choices: Optional[List[str]] = None,
index: Optional[Union[int, str]] = None,
id: Optional[Union[int, str]] = None,
messages: Optional[List[dict]] = None,
**kwargs) -> dict:
"""
Generates a dictionary representation of prompt data for evaluation or inference.
Args:
prompt (str): The main prompt or input text. Can also be a list of prompts.
system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
If not provided, uses self.choices. Defaults to None.
            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt;
                falls back to 0 when not provided. Defaults to None.
id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
If messages is provided, it will be used as the prompt data instead of the prompt string.
Returns:
dict: A dictionary representation of the prompt data, suitable for further processing or model input.
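        Example (illustrative; the exact keys of the returned dict come from PromptData.to_dict()):
            self.gen_prompt_data('What is 2 + 2?', system_prompt='You are a helpful assistant.')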
""" # noqa: E501
data = [prompt] if not isinstance(prompt, list) else prompt
prompt_data = PromptData(
data=data,
multi_choices=choices or self.choices,
system_prompt=system_prompt or self.system_prompt,
index=index or 0,
id=id,
messages=messages)
return prompt_data.to_dict()
def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
"""
Generate model prompt from raw input, unify the prompt format for different datasets.
The input format is compatible with OpenAI Chat Completions APIs.
Args:
input_d (Any): The raw input. Depending on the dataset.
subset_name (str): The subset name.
few_shot_list (list): The few-shot examples.
Returns:
For class ChatGenerationModelAdapter, the output format is:
{'data': [full_prompt], 'system_prompt': (str, optional)}, -- full_prompt: str, the constructed prompt for each sample from dataset.
For class MultiChoiceModelAdapter, the output format is:
{'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
For class ContinuationEvalModelAdapter, the output format is:
{'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
""" # noqa: E501
raise NotImplementedError
@abstractmethod
def get_gold_answer(self, input_d: Any) -> Any:
"""
Parse the raw input labels (gold).
Args:
input_d: input raw data. Depending on the dataset.
Returns:
The parsed input. e.g. gold answer ... Depending on the dataset.
"""
raise NotImplementedError
def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
"""
Parse the predicted result and extract proper answer.
Args:
result: Predicted answer from the model. Usually a string for chat.
raw_input_d: The raw input. Depending on the dataset.
            eval_type: one of 'checkpoint', 'service' or 'custom'. Default: 'checkpoint'
Returns:
The parsed answer. Depending on the dataset. Usually a string for chat.
"""
return result
def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
"""
Parse the predicted result using LLM.
Args:
result (Any): The predicted answer from the model.
raw_input_d (dict): The raw input data.
eval_type (str): The evaluation type, default is 'checkpoint'.
Returns:
The parsed answer. Usually a string for chat.
"""
return result
@abstractmethod
def match(self, gold: Any, pred: Any) -> Any:
"""
Match the gold answer and the predicted answer.
Args:
gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
e.g. 'A', extracted from get_gold_answer method.
pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
e.g. 'B', extracted from parse_pred_result method.
Returns:
The match result. Usually a score (float) for chat/multiple-choice-questions.
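        Example: a typical exact-match implementation would return 1.0 for match('A', 'A')
            and 0.0 for match('A', 'B'); subclasses decide the actual scoring.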
"""
raise NotImplementedError
def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
"""
Use LLM as a judge to evaluate the predicted answer against the gold answer.
        Args:
            gold (Any): The golden answer.
            pred (Any): The predicted answer.
            judge (Optional[LLMJudge]): The judge model used for scoring. If None, a warning is logged
                and 0 is returned.
            **kwargs: extra context; 'raw_input' (dict) is searched for a question-like field to include
                in the judge prompt.
        Returns:
            The match result as a float score between 0 and 1.
"""
# Default judge handling
if judge is None:
logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
return 0
# Extract question from raw_input if available
raw_input = kwargs.get('raw_input', {})
question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
# Find the first non-empty question key in raw_input
question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
# Request judge and obtain score
prompt = judge.build_prompt(pred, gold, question)
judge_response = judge(prompt)
score = judge.get_score(judge_response)
return score