# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()


@Benchmark.register(
    name='arc',
    pretty_name='ARC',
    tags=['Reasoning', 'MCQ'],
    description=
    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
    dataset_id='modelscope/ai2_arc',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template=
    'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.',  # noqa
)
class ARCAdapter(DataAdapter):

    def __init__(self, **kwargs):
        few_shot_num = kwargs.get('few_shot_num', None)
        if few_shot_num is None:
            # Use 0-shot by default
            logger.info('Set 0-shot examples by system for ARC.')
            few_shot_num = 0

        if few_shot_num != 0:
            logger.warning(f'few_shot_num is recommended to be set to 0 for ARC, got {few_shot_num}.')

        super().__init__(**kwargs)

        self.choices = ['A', 'B', 'C', 'D']

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.

        Args:
            dataset_name_or_path: str, the dataset id or path. e.g. 'arc'
            subset_list: list, the subsets to load. e.g. ['ARC-Easy', 'ARC-Challenge']
            work_dir: str, the local root data directory. e.g. '/path/to/data'
            kwargs: dict, other arguments.
        """
        data_dict = {}
        for subset_name in subset_list:
            if os.path.exists(dataset_name_or_path):
                subset_path = os.path.join(dataset_name_or_path, subset_name)
            else:
                subset_path = os.path.join(work_dir, dataset_name_or_path, subset_name)
            for split_name in ['Train', 'Test']:
                split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
                if os.path.exists(split_path):
                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
                        rows = []
                        for line in in_f:
                            item = json.loads(line.strip())
                            raw_choices = item['question']['choices']
                            rows.append({
                                'id': item['id'],
                                'question': item['question']['stem'],
                                'choices': {
                                    'text': [d['text'] for d in raw_choices],
                                    'label': [d['label'] for d in raw_choices]
                                },
                                'answerKey': item['answerKey'],
                            })

                        if subset_name in data_dict:
                            data_dict[subset_name].update({split_name.lower(): rows})
                        else:
                            data_dict[subset_name] = {split_name.lower(): rows}

        return data_dict
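
    # On-disk layout implied by the loader above (an illustrative sketch; the actual
    # files depend on how the dataset was downloaded or unpacked locally):
    #
    #   {work_dir}/{dataset_name_or_path}/
    #       ARC-Easy/ARC-Easy-Train.jsonl
    #       ARC-Easy/ARC-Easy-Test.jsonl
    #       ARC-Challenge/ARC-Challenge-Train.jsonl
    #       ARC-Challenge/ARC-Challenge-Test.jsonl
    #
    # Each JSONL line is a raw ARC record with 'id', 'question' ('stem' + 'choices')
    # and 'answerKey'; load_from_disk flattens it into the row format used below.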

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from raw data, unifying the prompt format for the ARC benchmark.

        Args:
            input_d (dict): The raw input. A single ARC record, e.g.:

                {
                    'id': 'Mercury_7220990',
                    'question': 'Which factor will most likely cause a person to develop a fever?',
                    'choices': {
                        'text': ['a leg muscle relaxing after exercise',
                                 'a bacterial population in the bloodstream',
                                 'several viral particles on the skin',
                                 'carbohydrates being digested in the stomach'],
                        'label': ['A', 'B', 'C', 'D']
                    },
                    'answerKey': 'B'
                }

        Returns:
            {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
        """
        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)

        full_prompt = self.prompt_template.format(query=context)

        return self.gen_prompt_data(full_prompt)
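
    # Illustrative rendering (zero-shot, i.e. few_shot_list == []): for the sample
    # record in the docstring above, `context` is the question followed by the
    # lettered choices, and `full_prompt` becomes roughly:
    #
    #   Given the following question and four candidate answers (A, B, C and D), choose the best answer.
    #   Which factor will most likely cause a person to develop a fever?
    #   A. a leg muscle relaxing after exercise
    #   B. a bacterial population in the bloodstream
    #   C. several viral particles on the skin
    #   D. carbohydrates being digested in the stomach
    #   Your response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.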

    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
        return input_d.get('answerKey', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d (dict): The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom', default: 'checkpoint'

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return ResponseParser.parse_first_option(text=result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)
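
    # Illustrative expectation (a sketch of intent, not a guarantee of ResponseParser
    # internals): for a generation-style completion such as
    #   "... The best answer is B"
    # parse_first_option(text=result, options=['A', 'B', 'C', 'D']) is expected to
    # return 'B', which match() then compares against the gold 'answerKey' via exact_match.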

    @classmethod
    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

        example: str = input_d['question']

        choices_texts: list = input_d['choices']['text']
        choices_labels: list = input_d['choices']['label']
        choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
        example += '\n' + choices_prompts

        if include_answer:
            example += '\nAnswer:'
            example += ' {}\n\n'.format(input_d['answerKey'])

        return example