# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
import random
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union

from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
from evalscope.metrics import LLMJudge, metric_registry
from evalscope.report import Report, ReportGenerator
from evalscope.utils.logger import get_logger

logger = get_logger()


class DataAdapter(ABC):
    """
    Data Adapter for the benchmark. You need to implement the following methods:
        - gen_prompt
        - get_gold_answer
        - parse_pred_result
        - match
    """

    def __init__(self,
                 name: str,
                 dataset_id: str,
                 model_adapter: str,
                 subset_list: list,
                 metric_list: List[str],
                 llm_as_a_judge: bool = False,
                 output_types: Optional[List[str]] = None,
                 few_shot_num: Optional[int] = 0,
                 train_split: Optional[str] = None,
                 eval_split: Optional[str] = None,
                 prompt_template: Optional[str] = None,
                 system_prompt: Optional[str] = None,
                 query_template: Optional[str] = None,
                 pretty_name: Optional[str] = None,
                 description: Optional[str] = None,
                 tags: Optional[List[str]] = None,
                 **kwargs):
        """
        Args:
            name: str, the name of the benchmark.
            dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
            model_adapter: str, the model adapter to use for the benchmark.
            subset_list: list, the subset names of the dataset.
            metric_list: list, the metrics used to evaluate the model on the benchmark.
            llm_as_a_judge: bool, whether to use an LLM as a judge to evaluate the predicted answer against the gold answer.
            output_types: list, the output types of the model adapter. Default: [model_adapter]
            few_shot_num: int, number of few-shot examples. Default: 0
            train_split: str, usually used for few-shot examples, e.g. 'train'.
            eval_split: str, the target eval split name, e.g. 'test'.
            prompt_template: str, the prompt template for the benchmark,
                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                the form of A or B or C or D, do not output explanation:`
            system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
            query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
            pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
            description: str, the description of the benchmark,
                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
            tags: list, tags used to categorize the benchmark, e.g. ['Math', 'Reasoning'].
        """  # noqa: E501
        self.name = name
        self.dataset_id = dataset_id
        self.model_adapter = model_adapter
        self.subset_list = subset_list
        self.metric_list = metric_list
        self.llm_as_a_judge = llm_as_a_judge
        self.output_types = output_types or [model_adapter]
        self.few_shot_num = few_shot_num
        self.train_split = train_split
        self.eval_split = eval_split
        self.prompt_template = prompt_template
        self.system_prompt = system_prompt
        self.query_template = query_template
        self.pretty_name = pretty_name
        self.description = description
        self.tags = tags or []
        self.config_kwargs = kwargs
        self.category_map = kwargs.get('category_map', {})
        self.choices = kwargs.get('choices', None)

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)

        # find and decorate parse_pred_result method
        if hasattr(cls, 'parse_pred_result'):
            original_method = cls.parse_pred_result
            cls.parse_pred_result = preprocess_decorator(original_method)
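
    # Note (illustrative, not part of the original code): because of the hook above,
    # any subclass that defines parse_pred_result automatically gets it wrapped by
    # preprocess_decorator at class-creation time; subclasses do not need to apply
    # the decorator themselves.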

    def load(self,
             dataset_name_or_path: str = None,
             subset_list: list = None,
             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
             **kwargs) -> dict:
        """
        Load the dataset. Remote and local datasets are supported.
        You can override this method to support your own local dataset; just follow the format of the output.

        Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
            train_dataset, test_dataset: Iterable dataset, each item of which is a dict.

        """
        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
        subset_list = subset_list or self.subset_list

        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
            trust_remote_code = kwargs.pop('trust_remote_code', False)
            data_dict = self.load_from_disk(
                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
        else:
            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
            trust_remote_code = kwargs.pop('trust_remote_code', True)
            data_dict = self.load_from_hub(
                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
        if len(data_dict) == 0:
            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
        return data_dict
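
    # Illustrative example (not part of the original code): for a benchmark with
    # subset_list=['default'] (hypothetical), train_split='train' and eval_split='test',
    # the returned structure looks like:
    #   {'default': {'train': <iterable of dict>, 'test': <iterable of dict>}}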

    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
        from modelscope.msdatasets import MsDataset

        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
        split_as_subset: bool = kwargs.pop('split_as_subset', False)
        # Load dataset from remote
        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')

        data_dict = {}
        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
        if len(split_list) == 0:
            logger.error(f'Got empty split list: {split_list}')

        if split_as_subset:
            for sub_name in subset_list:
                data_dict[sub_name] = {}
                # e.g. train: few-shot, test: target dataset to evaluate
                for split in split_list:
                    dataset = MsDataset.load(
                        dataset_name=dataset_name_or_path,
                        split=sub_name,  # load subset from split
                        cache_dir=work_dir,
                        hub=datasets_hub,
                        **kwargs)
                    data_dict[sub_name].update({split: dataset})
        else:
            for sub_name in subset_list:
                data_dict[sub_name] = {}
                # e.g. train: few-shot, test: target dataset to evaluate
                for split in split_list:
                    dataset = MsDataset.load(
                        dataset_name=dataset_name_or_path,
                        subset_name=sub_name,
                        split=split,
                        cache_dir=work_dir,
                        hub=datasets_hub,
                        **kwargs)
                    data_dict[sub_name].update({split: dataset})

        return data_dict
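
    # Note (illustrative, not part of the original code): with split_as_subset=True the
    # subset name itself is passed to MsDataset.load as the `split` argument, which is
    # useful for hub datasets that expose their subsets as splits rather than as
    # separate subset configurations.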

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.
        If you want to support a local dataset, please override this method in the corresponding xxx_data_adapter.
        By default, modelscope.msdatasets.MsDataset.load is used to load the dataset from local disk.
        """
        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

    def load_with_snapshot(self,
                           file_structure: Dict[str, List[str]],
                           dataset_name_or_path: str = None,
                           subset_list: list = None,
                           work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
                           **kwargs) -> dict:
        """
        For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
        This feature supports both remote and local datasets.

        Args:
            file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
            dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
            subset_list: list of subset names for the dataset.
            work_dir: str, the working directory to store the dataset.
        Returns: {'subset_name': {'eval': eval_dataset}}
        """  # noqa: E501
        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
        subset_list = subset_list or self.subset_list

        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
            logger.info(f'Loading dataset from {dataset_name_or_path}')
            dataset_path = dataset_name_or_path
        else:
            from modelscope import dataset_snapshot_download

            # Load dataset from remote
            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
            # flatten file structure
            file_names = [file for sub_files in file_structure.values() for file in sub_files]
            # download dataset snapshot
            dataset_path = dataset_snapshot_download(
                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
        # read and process files
        data_dict = defaultdict(dict)
        for sub_name in subset_list:
            file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
            # not train split, only eval split
            data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)

        return data_dict
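
    # Illustrative example (not part of the original code): with
    #   file_structure = {'main': ['main/test.jsonl']}
    # load_with_snapshot downloads (or locates) only the listed files and returns
    #   {'main': {<eval_split>: <list of dict loaded from main/test.jsonl>}}
    # The subset name 'main' and file name 'main/test.jsonl' above are hypothetical.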

    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
        """
        Regroup the dataset into subsets according to `subset_key`, formatting the new subset name with `format`.
        """
        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})

        for sub_name, sub_data_dict in data_dict.items():
            for split in [self.train_split, self.eval_split]:
                if split is None:
                    continue
                for sample_d in sub_data_dict[split]:
                    new_subset_name = format.format(sample_d[subset_key])
                    if new_subset_name not in self.subset_list:
                        continue
                    res_dict[new_subset_name][split].append(sample_d)
        return res_dict
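
    # Illustrative example (not part of the original code): if every sample carries a
    # 'category' field and subset_list == ['math', 'physics'], then
    #   reformat_subset(data_dict, subset_key='category')
    # re-buckets the samples so that res_dict['math'][split] holds the samples whose
    # sample['category'] formats to 'math'; samples whose formatted name is not in
    # subset_list are dropped. The field name 'category' and subset names are hypothetical.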

    def gen_prompts(self, data_dict: dict) -> dict:
        """
        Generate dataset prompts from the raw input, unifying the prompt format across different datasets.

        Args:
            data_dict: Refer to the output of the load method: evalscope.benchmarks.benchmark.Benchmark.load

        Returns:
            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
            prompt_d_i (dict): refer to the output of the gen_prompt method.

        e.g. train -- few-shot data, test -- target dataset to evaluate.
        """
        res_dict: dict = {}

        if self.few_shot_num and self.few_shot_num < 0:
            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

        logger.info(f'Use settings: '
                    f'> few_shot_num: {self.few_shot_num}, '
                    f'> few_shot_split: {self.train_split}, '
                    f'> target_eval_split: {self.eval_split}')

        for sub_name, sub_data_dict in data_dict.items():
            few_shot_data = []
            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                          self.few_shot_num,
                                                          few_shot_random=few_shot_random)

            res_dict[sub_name] = []
            for sample_d in sub_data_dict[self.eval_split]:
                prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
                res_dict[sub_name].append(prompt_d)

        return res_dict
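
    # Illustrative example (not part of the original code): the returned mapping looks like
    #   {'default': [prompt_d_1, prompt_d_2, ...]}
    # where each prompt dict is produced by gen_prompt and additionally carries the raw
    # sample under the AnswerKeys.RAW_INPUT key. The subset name 'default' is hypothetical,
    # and the exact keys inside each prompt dict depend on the subclass's gen_prompt.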

    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

        if k > len(data_list):
            k = len(data_list)
        if few_shot_random:
            return random.sample(data_list, k)
        else:
            return data_list[:k]
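
    # Note (illustrative, not part of the original code): k is clamped to len(data_list),
    # so asking for more few-shot examples than available simply returns every example;
    # with few_shot_random=False the first k items are taken in dataset order instead of
    # a random sample.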

    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
        """
        Compute evaluation results with the specified metrics.

        Args:
            review_res_list: list, the review results; each item is the match result for a gold/pred pair.

        Returns:
            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
        """
        if len(self.metric_list) == 0:
            raise ValueError('No metric list found for the benchmark.')

        res_list = []
        for metric_str in self.metric_list:
            metric = metric_registry.get(metric_str)
            metric_name = metric.name
            metric_func = metric.object
            if isinstance(review_res_list, dict):
                review_res = review_res_list.get(metric_name, [])
            else:
                review_res = review_res_list
            res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
        return res_list
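
    # Illustrative example (not part of the original code): with
    #   self.metric_list = ['AverageAccuracy'] and review_res_list = [1.0, 0.0, 1.0, 1.0]
    # the registered metric function is applied to the whole list, giving something like
    #   [{'metric_name': 'AverageAccuracy', 'score': 0.75, 'num': 4}]
    # When review_res_list is a dict, each metric reads its own list keyed by metric name.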

    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
                            **kwargs) -> Dict[str, List[float]]:
        """
        Group the scores of all samples by metric name.

        Args:
            review_res_list: the review results, either a flat list of scores/score dicts
                or a nested list of them.

        Returns:
            A dict mapping each metric name to the list of per-sample scores,
            e.g. {'AverageAccuracy': [score1, score2, ...]}

        """
        if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
            review_res_list = [item for sublist in review_res_list for item in sublist]

        items = defaultdict(list)
        for scores in review_res_list:
            if isinstance(scores, dict):
                for k, v in scores.items():
                    items[k].append(v)
            else:
                items['AverageAccuracy'].append(scores)
        return items
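
    # Illustrative example (not part of the original code):
    #   [{'precision': 1.0, 'recall': 0.5}, {'precision': 0.0, 'recall': 1.0}]
    # is grouped into {'precision': [1.0, 0.0], 'recall': [0.5, 1.0]}, while a plain list
    # of scalar scores such as [1.0, 0.0] is collected under the 'AverageAccuracy' key.
    # The key names 'precision'/'recall' are hypothetical.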

    def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
        """
        Generate report for the evaluation results for all subsets.

        Args:
            subset_score_map: The subset-score map.
                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}

            model_name: The evaluation model name.

        Returns: The evaluation report.

        Here is a format example for gsm8k:
        {
            "name": "qwen2.5_gsm8k",
            "metrics": [
                {
                    "name": "AverageAccuracy",
                    "categories": [
                        {
                            "name": "default",
                            "subsets": [
                                {
                                    "name": "main",
                                    "score": 0.0,
                                    "num": 2
                                }
                            ],
                            "num": 2,
                            "score": 0.0,
                            "macro_score": 0.0
                        }
                    ],
                    "num": 2,
                    "score": 0.0,
                    "macro_score": 0.0
                }
            ],
            "dataset_name": "gsm8k",
            "model_name": "qwen2.5"
        }
        """  # noqa: E501
        return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)

    def post_process_report(self, report: Report, **kwargs):
        """
        Post-process the report after generation. Draw a chart, save to file, etc.
        This method can be overridden to customize the report format or content.

        Args:
            report (Report): The generated report.
        """
        pass

    def gen_prompt_data(self,
                        prompt: str,
                        system_prompt: Optional[str] = None,
                        choices: Optional[List[str]] = None,
                        index: Optional[Union[int, str]] = None,
                        id: Optional[Union[int, str]] = None,
                        messages: Optional[List[dict]] = None,
                        **kwargs) -> dict:
        """
        Generate a dictionary representation of prompt data for evaluation or inference.

        Args:
            prompt (str): The main prompt or input text. Can also be a list of prompts.
            system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
            choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
                If not provided, uses self.choices. Defaults to None.
            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
                Defaults to 0 if not provided.
            id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
            messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
                If messages is provided, it is used as the prompt data instead of the prompt string.

        Returns:
            dict: A dictionary representation of the prompt data, suitable for further processing or model input.
        """  # noqa: E501
        data = [prompt] if not isinstance(prompt, list) else prompt
        prompt_data = PromptData(
            data=data,
            multi_choices=choices or self.choices,
            system_prompt=system_prompt or self.system_prompt,
            index=index or 0,
            id=id,
            messages=messages)
        return prompt_data.to_dict()
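
    # Illustrative example (not part of the original code):
    #   self.gen_prompt_data('What is 1 + 1?')
    # returns a dict (via PromptData.to_dict()) whose data field is ['What is 1 + 1?']
    # and which also carries the adapter-level system_prompt and choices defaults.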

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        """
        Generate the model prompt from the raw input, unifying the prompt format across different datasets.
        The input format is compatible with the OpenAI Chat Completions API.

        Args:
            input_d (Any): The raw input. Depending on the dataset.
            subset_name (str): The subset name.
            few_shot_list (list): The few-shot examples.

        Returns:
            For class ChatGenerationModelAdapter, the output format is:
                {'data': [full_prompt], 'system_prompt': (str, optional)} -- full_prompt: str, the constructed prompt for each sample from the dataset.
            For class MultiChoiceModelAdapter, the output format is:
                {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from the dataset.
            For class ContinuationEvalModelAdapter, the output format is:
                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
        """  # noqa: E501
        raise NotImplementedError
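
    # Illustrative sketch (not part of the original code) of a chat-style gen_prompt in a
    # subclass, assuming the raw sample has a 'question' field (hypothetical):
    #   def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
    #       full_prompt = f"{self.prompt_template or ''}{input_d['question']}"
    #       return self.gen_prompt_data(full_prompt)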

    @abstractmethod
    def get_gold_answer(self, input_d: Any) -> Any:
        """
        Parse the raw input labels (gold).

        Args:
            input_d: input raw data. Depending on the dataset.

        Returns:
            The parsed gold answer. Depending on the dataset.
        """
        raise NotImplementedError

    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        """
        Parse the predicted result and extract the proper answer.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d: The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom'. Default: 'checkpoint'.

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        return result

    def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        """
        Parse the predicted result using LLM.

        Args:
            result (Any): The predicted answer from the model.
            raw_input_d (dict): The raw input data.
            eval_type (str): The evaluation type, default is 'checkpoint'.

        Returns:
            The parsed answer. Usually a string for chat.
        """
        return result

    @abstractmethod
    def match(self, gold: Any, pred: Any) -> Any:
        """
        Match the gold answer and the predicted answer.

        Args:
            gold (Any): The gold answer. Usually a string for chat/multiple-choice questions.
                e.g. 'A', extracted by the get_gold_answer method.
            pred (Any): The predicted answer. Usually a string for chat/multiple-choice questions.
                e.g. 'B', extracted by the parse_pred_result method.

        Returns:
            The match result. Usually a score (float) for chat/multiple-choice questions.
        """
        raise NotImplementedError
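
    # Illustrative sketch (not part of the original code) of a simple exact-match
    # implementation a subclass might use:
    #   def match(self, gold, pred):
    #       return 1.0 if str(pred).strip() == str(gold).strip() else 0.0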

    def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
        """
        Use an LLM as a judge to evaluate the predicted answer against the gold answer.

        Args:
            gold (Any): The gold answer.
            pred (Any): The predicted answer.
            judge (Optional[LLMJudge]): The judge model used to score the prediction.

        Returns:
            The match result as a float score between 0 and 1.
        """
        # Default judge handling
        if judge is None:
            logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
            return 0

        # Extract question from raw_input if available
        raw_input = kwargs.get('raw_input', {})
        question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
        # Find the first non-empty question key in raw_input
        question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)

        # Request judge and obtain score
        prompt = judge.build_prompt(pred, gold, question)
        judge_response = judge(prompt)
        score = judge.get_score(judge_response)

        return score
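

# Illustrative sketch (not part of the original module): a minimal concrete adapter,
# using hypothetical names and field keys, might look like the following. It is kept
# as a comment so the module's behavior is unchanged.
#
#   class MyQADataAdapter(DataAdapter):
#
#       def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
#           full_prompt = f"Question: {input_d['question']}\nAnswer:"
#           return self.gen_prompt_data(full_prompt)
#
#       def get_gold_answer(self, input_d):
#           return input_d['answer']
#
#       def parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
#           return result.strip()
#
#       def match(self, gold, pred):
#           return 1.0 if gold == pred else 0.0
#
# The class name and the 'question'/'answer' field names are assumptions for illustration.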