evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/gpqa/gpqa_adapter.py

import os
import random
import re

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match


@Benchmark.register(
    name='gpqa',
    pretty_name='GPQA',
    tags=['MCQ', 'Knowledge'],
    description=
    'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
    dataset_id='modelscope/gpqa',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
    metric_list=['AveragePass@1'],
    few_shot_num=5,
    train_split=None,
    eval_split='train',  # only have train split
    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
)
class GPQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.choices = ['A', 'B', 'C', 'D']
        if self.few_shot_num and self.few_shot_num > 0:
            self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
            self.prompt_prefix += open(
                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
                encoding='utf-8').read() + '\nQuestion: '
        else:
            self.prompt_prefix = 'What is the correct answer to this question:'

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        example:
        {
            "question":"Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?",
            "choice1":"1/2",
            "choice2":"1/4",
            "choice3":"2/3",
            "choice4":"1/8",
            "answer":"C",
        }
        """  # noqa: E501
        processed_input_d = self.__process_input(input_d)
        input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
        query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501

        prompt = self.prompt_template.format(query=query)
        return self.gen_prompt_data(prompt)

    def __process_input(self, input_d: dict) -> dict:

        def preprocess(text):
            if text is None:
                return ' '
            text = text.strip()
            text = text.replace(' [title]', '. ')
            text = re.sub('\\[.*?\\]', '', text)
            text = text.replace('  ', ' ')
            return text

        choices = [
            preprocess(input_d['Incorrect Answer 1']),
            preprocess(input_d['Incorrect Answer 2']),
            preprocess(input_d['Incorrect Answer 3']),
            preprocess(input_d['Correct Answer']),
        ]
        random.shuffle(choices)
        correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))

        out_doc = {
            'choices': [choices[0], choices[1], choices[2], choices[3]],
            'answer': f'{chr(65 + correct_answer_index)}',
        }
        return out_doc

    def __form_options(self, options: list):
        option_str = 'Choices:\n'
        for opt, choice in zip(options, self.choices):
            option_str += f'({choice}) {opt}' + '\n'
        return option_str

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return GPQAAdapter.get_multiple_choice_answer(result)

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        return exact_match(gold=gold, pred=pred)

    @staticmethod
    def get_multiple_choice_answer(pred: str):
        tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
        if tmp:
            pred = tmp
        else:
            pred = [pred.strip().strip('.')]

        if len(pred) == 0:
            pred = ''
        else:
            pred = pred[-1]

        # Remove the period at the end, again!
        pred = pred.rstrip('.').rstrip('/')

        return pred