# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
import random
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union

from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
from evalscope.metrics import LLMJudge, metric_registry
from evalscope.report import Report, ReportGenerator
from evalscope.utils.logger import get_logger

logger = get_logger()


class DataAdapter(ABC):
    """
    Data Adapter for the benchmark. You need to implement the following methods:
        - gen_prompt
        - get_gold_answer
        - parse_pred_result
        - match
    """

    def __init__(self,
                 name: str,
                 dataset_id: str,
                 model_adapter: str,
                 subset_list: list,
                 metric_list: List[str],
                 llm_as_a_judge: bool = False,
                 output_types: Optional[List[str]] = None,
                 few_shot_num: Optional[int] = 0,
                 train_split: Optional[str] = None,
                 eval_split: Optional[str] = None,
                 prompt_template: Optional[str] = None,
                 system_prompt: Optional[str] = None,
                 query_template: Optional[str] = None,
                 pretty_name: Optional[str] = None,
                 description: Optional[str] = None,
                 tags: Optional[List[str]] = None,
                 **kwargs):
        """
        Args:
            name: str, the name of the benchmark.
            dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
            model_adapter: str, the model adapter to use for the benchmark.
            subset_list: list of subset names for the dataset.
            metric_list: list, the metric list to evaluate the model on the specific benchmark.
            llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
            output_types: list, the output types of the model adapter. Default: [model_adapter]
            few_shot_num: int, number of few-shot examples. Default: 0
            train_split: str, usually for few-shot examples. e.g. 'train'
            eval_split: str, the target eval split name. e.g. 'test'
            prompt_template: str, the prompt template for the benchmark,
                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                the form of A or B or C or D, do not output explanation:`
            system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
            query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
            pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
            description: str, the description of the benchmark,
                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
            tags: list, the tags of the benchmark.
""" # noqa: E501 self.name = name self.dataset_id = dataset_id self.model_adapter = model_adapter self.subset_list = subset_list self.metric_list = metric_list self.llm_as_a_judge = llm_as_a_judge self.output_types = output_types or [model_adapter] self.few_shot_num = few_shot_num self.train_split = train_split self.eval_split = eval_split self.prompt_template = prompt_template self.system_prompt = system_prompt self.query_template = query_template self.pretty_name = pretty_name self.description = description self.tags = tags or [] self.config_kwargs = kwargs self.category_map = kwargs.get('category_map', {}) self.choices = kwargs.get('choices', None) def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) # find and decorate parse_pred_result method if hasattr(cls, 'parse_pred_result'): original_method = cls.parse_pred_result cls.parse_pred_result = preprocess_decorator(original_method) def load(self, dataset_name_or_path: str = None, subset_list: list = None, work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR, **kwargs) -> dict: """ Load the dataset. Remote and local datasets are supported. You can rewrite this method to support your own local dataset, just follow the format of the output. Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}} train_dataset, test_dataset: Iterable dataset, object each item of which is a dict. """ dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id) subset_list = subset_list or self.subset_list # Try to load dataset from local disk if os.path.exists(dataset_name_or_path): logger.info(f'Loading dataset from local disk: {dataset_name_or_path}') trust_remote_code = kwargs.pop('trust_remote_code', False) data_dict = self.load_from_disk( dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs) else: logger.info(f'Loading dataset from hub: {dataset_name_or_path}') trust_remote_code = kwargs.pop('trust_remote_code', True) data_dict = self.load_from_hub( dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs) if len(data_dict) == 0: raise ValueError(f'Dataset is empty: {dataset_name_or_path}') return data_dict def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict: from modelscope.msdatasets import MsDataset datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE) split_as_subset: bool = kwargs.pop('split_as_subset', False) # Load dataset from remote logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}') data_dict = {} split_list = [split for split in [self.train_split, self.eval_split] if split is not None] if len(split_list) == 0: logger.error(f'Got empty split list: {split_list}') if split_as_subset: for sub_name in subset_list: data_dict[sub_name] = {} # e.g. train: few-shot, test: target dataset to evaluate for split in split_list: dataset = MsDataset.load( dataset_name=dataset_name_or_path, split=sub_name, # load subset from split cache_dir=work_dir, hub=datasets_hub, **kwargs) data_dict[sub_name].update({split: dataset}) else: for sub_name in subset_list: data_dict[sub_name] = {} # e.g. 
                for split in split_list:
                    dataset = MsDataset.load(
                        dataset_name=dataset_name_or_path,
                        subset_name=sub_name,
                        split=split,
                        cache_dir=work_dir,
                        hub=datasets_hub,
                        **kwargs)
                    data_dict[sub_name].update({split: dataset})

        return data_dict

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.
        If you want to support a local dataset, please override this method in xxx_data_adapter.
        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
        """
        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

    def load_with_snapshot(self,
                           file_structure: Dict[str, List[str]],
                           dataset_name_or_path: str = None,
                           subset_list: list = None,
                           work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
                           **kwargs) -> dict:
        """
        For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
        This feature supports both remote and local datasets.

        Args:
            file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
            dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
            subset_list: list of subset names for the dataset.
            work_dir: str, the working directory to store the dataset.

        Returns: {'subset_name': {'eval': eval_dataset}}
        """  # noqa: E501
        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
        subset_list = subset_list or self.subset_list

        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
            logger.info(f'Loading dataset from {dataset_name_or_path}')
            dataset_path = dataset_name_or_path
        else:
            from modelscope import dataset_snapshot_download

            # Load dataset from remote
            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
            # flatten file structure
            file_names = [file for sub_files in file_structure.values() for file in sub_files]
            # download dataset snapshot
            dataset_path = dataset_snapshot_download(
                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)

        # read and process files
        data_dict = defaultdict(dict)
        for sub_name in subset_list:
            file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
            # not train split, only eval split
            data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)

        return data_dict

    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
        """
        Reformat the dataset subset with subset_key and format.
        """
        res_dict: dict = defaultdict(
            lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})

        for sub_name, sub_data_dict in data_dict.items():
            for split in [self.train_split, self.eval_split]:
                if split is None:
                    continue
                for sample_d in sub_data_dict[split]:
                    new_subset_name = format.format(sample_d[subset_key])
                    if new_subset_name not in self.subset_list:
                        continue
                    res_dict[new_subset_name][split].append(sample_d)
        return res_dict

    def gen_prompts(self, data_dict: dict) -> dict:
        """
        Generate dataset prompts from raw input, unify the prompt format for different datasets.

        Args:
            data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load

        Returns:
            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
            prompt_d_i (dict): refer to the output of gen_prompt method.
            e.g. train -- few-shot data, test -- target dataset to evaluate.
""" res_dict: dict = {} if self.few_shot_num and self.few_shot_num < 0: raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.') logger.info(f'Use settings: ' f'> few_shot_num: {self.few_shot_num}, ' f'> few_shot_split: {self.train_split}, ' f'> target_eval_split: {self.eval_split}') for sub_name, sub_data_dict in data_dict.items(): few_shot_data = [] if self.train_split and self.few_shot_num and self.few_shot_num > 0: few_shot_random: bool = self.config_kwargs.get('few_shot_random', True) few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]], self.few_shot_num, few_shot_random=few_shot_random) res_dict[sub_name] = [] for sample_d in sub_data_dict[self.eval_split]: prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data) prompt_d[AnswerKeys.RAW_INPUT] = sample_d res_dict[sub_name].append(prompt_d) return res_dict def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True): if k > len(data_list): k = len(data_list) if few_shot_random: return random.sample(data_list, k) else: return data_list[:k] def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]: """ Compute evaluation result by specific metrics. Args: review_res_list: list, the review result list, each item of which is match result for gold and pred. Returns: Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}] """ if len(self.metric_list) == 0: raise ValueError('No metric list found for the benchmark.') res_list = [] for metric_str in self.metric_list: metric = metric_registry.get(metric_str) metric_name = metric.name metric_func = metric.object if isinstance(review_res_list, dict): review_res = review_res_list.get(metric_name, []) else: review_res = review_res_list res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)}) return res_list def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> Dict[str, List[float]]: """ compute weighted mean of score of all samples Args: review_res_list: [score1, score2, ...] Returns: avg_res: Dict[str, List[float]] """ if len(review_res_list) > 0 and isinstance(review_res_list[0], list): review_res_list = [item for sublist in review_res_list for item in sublist] items = defaultdict(list) for scores in review_res_list: if isinstance(scores, dict): for k, v in scores.items(): items[k].append(v) else: items['AverageAccuracy'].append(scores) return items def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report: """ Generate report for the evaluation results for all subsets. Args: subset_score_map: The subset-score map. e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]} model_name: The evaluation model name. Returns: The evaluation report. Here is a format example for gsm8k: { "name": "qwen2.5_gsm8k", "metrics": [ { "name": "AverageAccuracy", "categories": [ { "name": "default", "subsets": [ { "name": "main", "score": 0.0, "num": 2 } ], "num": 2, "score": 0.0, "macro_score": 0.0 } ], "num": 2, "score": 0.0, "macro_score": 0.0 } ], "dataset_name": "gsm8k", "model_name": "qwen2.5" } """ # noqa: E501 return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs) def post_process_report(self, report: Report, **kwargs): """ Post-process the report after generation. Draw a chart, save to file, etc. 
        This method can be overridden to customize the report format or content.

        Args:
            report (Report): The generated report.
        """
        pass

    def gen_prompt_data(self,
                        prompt: str,
                        system_prompt: Optional[str] = None,
                        choices: Optional[List[str]] = None,
                        index: Optional[Union[int, str]] = None,
                        id: Optional[Union[int, str]] = None,
                        messages: Optional[List[dict]] = None,
                        **kwargs) -> dict:
        """
        Generates a dictionary representation of prompt data for evaluation or inference.

        Args:
            prompt (str): The main prompt or input text. Can also be a list of prompts.
            system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions.
                Defaults to None.
            choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
                If not provided, uses self.choices. Defaults to None.
            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
                Defaults to None, in which case 0 is used.
            id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
            messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for
                chat-based prompts. Defaults to None. If messages is provided, it will be used as the prompt data
                instead of the prompt string.

        Returns:
            dict: A dictionary representation of the prompt data, suitable for further processing or model input.
        """  # noqa: E501
        data = [prompt] if not isinstance(prompt, list) else prompt
        prompt_data = PromptData(
            data=data,
            multi_choices=choices or self.choices,
            system_prompt=system_prompt or self.system_prompt,
            index=index or 0,
            id=id,
            messages=messages)
        return prompt_data.to_dict()

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        """
        Generate model prompt from raw input, unify the prompt format for different datasets.
        The input format is compatible with OpenAI Chat Completions APIs.

        Args:
            input_d (Any): The raw input. Depending on the dataset.
            subset_name (str): The subset name.
            few_shot_list (list): The few-shot examples.

        Returns:
            For class ChatGenerationModelAdapter, the output format is:
                {'data': [full_prompt], 'system_prompt': (str, optional)}
                -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class MultiChoiceModelAdapter, the output format is:
                {'data': [full_prompt], 'multi_choices': self.choices}
                -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class ContinuationEvalModelAdapter, the output format is:
                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
                -- ctx_continuation_pair_list: list, the context-continuation pair list.
        """  # noqa: E501
        raise NotImplementedError

    @abstractmethod
    def get_gold_answer(self, input_d: Any) -> Any:
        """
        Parse the raw input labels (gold).

        Args:
            input_d: input raw data. Depending on the dataset.

        Returns:
            The parsed input. e.g. gold answer ... Depending on the dataset.
        """
        raise NotImplementedError

    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        """
        Parse the predicted result and extract proper answer.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d: The raw input. Depending on the dataset.
            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
""" return result def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any: """ Parse the predicted result using LLM. Args: result (Any): The predicted answer from the model. raw_input_d (dict): The raw input data. eval_type (str): The evaluation type, default is 'checkpoint'. Returns: The parsed answer. Usually a string for chat. """ return result @abstractmethod def match(self, gold: Any, pred: Any) -> Any: """ Match the gold answer and the predicted answer. Args: gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions. e.g. 'A', extracted from get_gold_answer method. pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions. e.g. 'B', extracted from parse_pred_result method. Returns: The match result. Usually a score (float) for chat/multiple-choice-questions. """ raise NotImplementedError def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float: """ Use LLM as a judge to evaluate the predicted answer against the gold answer. Args: gold (Any): The golden answer. pred (Any): The predicted answer. Returns: The match result as a float score between 0 and 1. """ # Default judge handling if judge is None: logger.warning('No judge LLM provided, please specify a judge LLM in the config.') return 0 # Extract question from raw_input if available raw_input = kwargs.get('raw_input', {}) question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem'] # Find the first non-empty question key in raw_input question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None) # Request judge and obtain score prompt = judge.build_prompt(pred, gold, question) judge_response = judge(prompt) score = judge.get_score(judge_response) return score