# evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/competition_math/competition_math_adapter.py

# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) EleutherAI, Inc. and its affiliates.
import glob
import json
import os
from collections import defaultdict

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import extract_answer, math_equal, strip_answer_string
from evalscope.utils.logger import get_logger

# flake8: noqa

# Module-level logger; CompetitionMathAdapter.__init__ uses it to report unsupported few-shot settings.
logger = get_logger()


@Benchmark.register(
    name='competition_math',
    pretty_name='MATH',
    tags=['Mathematics'],
    description=
    'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
    dataset_id='modelscope/competition_math',
    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    metric_list=['AveragePass@1'],
    few_shot_num=4,
    train_split=None,
    eval_split='test',
    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
)
class CompetitionMathAdapter(DataAdapter):
    """Data adapter for the MATH (competition_math) benchmark.

    Builds a 0-shot or fixed 4-shot prompt from each problem, extracts the
    final answer from both the reference solution and the model output with
    ``extract_answer`` / ``strip_answer_string``, and scores a prediction
    with ``math_equal``.
    """

    def __init__(self, **kwargs):
        """Validate the few-shot setting and delegate to the base adapter.

        Args:
            **kwargs: Forwarded to ``DataAdapter.__init__``. ``few_shot_num``
                is read from here; any value other than 0 or 4 is logged as
                an error and replaced by 4.
        """
        few_shot_num = kwargs.get('few_shot_num', 4)
        # The few-shot examples are hard-coded in `_generate_prompt`, so only
        # "none" (0) or "all four" (4) can be honoured.
        if few_shot_num not in (0, 4):
            logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, '
                         f'but got {few_shot_num}. Use 4-shot by default.')
            kwargs['few_shot_num'] = 4

        super().__init__(**kwargs)

    def load(self, **kwargs):
        """Load the dataset and regroup it by the ``level`` field.

        The caller-supplied ``subset_list`` is overridden with ``['default']``
        so that everything is loaded in one go; the result of the base
        ``load`` is then passed to ``reformat_subset`` with
        ``subset_key='level'``.

        Args:
            **kwargs: Forwarded to ``DataAdapter.load``.

        Returns:
            Whatever ``reformat_subset`` returns (presumably a dict keyed by
            level name -- confirm against the base class).
        """
        # default load all levels
        kwargs['subset_list'] = ['default']
        data_dict = super().load(**kwargs)
        return self.reformat_subset(data_dict, subset_key='level')

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """Load the dataset from a local directory of per-sample JSON files.

        Expected layout: ``<base>/<split>/<any sub-directory>/<name>.json``,
        where ``<base>`` is ``dataset_name_or_path`` if that path exists and
        ``<work_dir>/<dataset_name_or_path>`` otherwise.

        Args:
            dataset_name_or_path: Existing dataset directory, or a path
                relative to ``work_dir``.
            subset_list: Subset names used as top-level keys of the result.
                The subset name is not part of the file path, so every subset
                receives the same samples.
            work_dir: Fallback root used when ``dataset_name_or_path`` does
                not exist as given.
            **kwargs: Unused.

        Returns:
            ``{subset_name: {split_name: [sample_dict, ...]}}``. Splits that
            are not configured (``None``) are omitted.
        """
        if os.path.exists(dataset_name_or_path):
            base_dir = dataset_name_or_path
        else:
            base_dir = os.path.join(work_dir, dataset_name_or_path)

        data_dict = defaultdict(dict)
        for subset_name in subset_list:
            for split_name in (self.train_split, self.eval_split):
                # `train_split` is registered as None for this benchmark;
                # os.path.join() would raise TypeError on it, so skip unset splits.
                if split_name is None:
                    continue
                split_dir = os.path.join(base_dir, split_name)
                # Without `recursive=True`, `**` behaves like `*`: this matches
                # JSON files exactly one directory below the split directory.
                # Sorted so the sample order does not depend on the filesystem.
                split_files = sorted(glob.glob(os.path.join(split_dir, '**', '*.json')))
                split_data = []
                for file_path in split_files:
                    if os.path.exists(file_path):
                        with open(file_path, 'r', encoding='utf-8') as f:
                            split_data.append(json.load(f))
                data_dict[subset_name][split_name] = split_data

        return data_dict

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """Generate the prompt for the model input.

        Args:
            input_d: Raw input dict. Only ``input_d['problem']`` is read here, e.g.
                {"problem": "...", "level": "Level 3", "type": "Algebra", "solution": "..."}
            few_shot_list: Unused. The few-shot examples are fixed and built
                into ``_generate_prompt``; they are included whenever
                ``self.few_shot_num > 0``.
            **kwargs: Unused.

        Returns:
            The result of ``self.gen_prompt_data`` for the full prompt
            (previously documented as ``{'data': [prompt]}`` -- confirm
            against the base class).
        """
        use_fewshot = self.few_shot_num > 0
        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
        # Only the template is formatted; braces inside `query` are inserted verbatim.
        full_prompt = self.prompt_template.format(query=query)
        return self.gen_prompt_data(full_prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """Extract the reference answer from ``input_d['solution']``."""
        return strip_answer_string(extract_answer(input_d['solution']))

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        """Extract the final answer from the model output.

        Args:
            result: Text generated by the model.
            raw_input_d: The raw input sample. Unused.
            eval_type: 'checkpoint', 'service' or 'custom'. Unused: the same
                extraction is applied for every evaluation type.

        Returns:
            The answer produced by ``extract_answer`` followed by
            ``strip_answer_string`` -- the same pipeline used for the gold answer.
        """
        return strip_answer_string(extract_answer(result))

    def match(self, gold: str, pred: str) -> float:
        """Return 1.0 if ``math_equal(pred, gold)`` is truthy, else 0.0."""
        return 1.0 if math_equal(pred, gold) else 0.0

    @classmethod
    def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
        """Build the ``Problem: ... Solution:`` query text for one sample.

        Args:
            input_d: Raw input dict; ``input_d['problem']`` is the problem text.
            use_fewshot: If True, prepend the four fixed worked examples.

        Returns:
            The query string, ending with ``'\\nSolution:\\n'`` so the model
            continues with its solution.
        """
        problem: str = input_d['problem']
        target = f'Problem:\n{problem}\nSolution:\n'

        if not use_fewshot:
            return target

        # Use 4-shot examples by system.
        # These are plain (non-f) literals that are never passed through
        # str.format(), so braces are written once; doubling them would put
        # literal `{{` / `}}` into the prompt. Backslashes are escaped
        # explicitly so LaTeX commands do not rely on invalid escape sequences.
        # NOTE(review): the lone `}` after the first problem statement is
        # carried over from the original text; it looks like a typo inherited
        # from the source prompt -- confirm before removing it.
        few_shot_context = (
            'Problem:\nFind the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'
            'Problem:\nIf $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$\nSolution:\nWe have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
            'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{16} \\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
            'Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n'
        )
        return few_shot_context + target