evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/competition_math/competition_math_adapter.py

126 lines
6.9 KiB
Python

# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) EleutherAI, Inc. and its affiliates.
import glob
import json
import os
from collections import defaultdict
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import extract_answer, math_equal, strip_answer_string
from evalscope.utils.logger import get_logger
# flake8: noqa
# Module-level logger; CompetitionMathAdapter.__init__ uses it to report
# unsupported few-shot settings.
logger = get_logger()
@Benchmark.register(
    name='competition_math',
    pretty_name='MATH',
    tags=['Mathematics'],
    description=
    'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
    dataset_id='modelscope/competition_math',
    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    metric_list=['AveragePass@1'],
    few_shot_num=4,
    train_split=None,
    eval_split='test',
    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
)
class CompetitionMathAdapter(DataAdapter):
    """Data adapter for the MATH (competition_math) benchmark.

    Supports exactly two prompting modes: 0-shot, or 4-shot with a fixed set
    of worked examples embedded in `_generate_prompt`.

    To be tested for all models.
    """

    def __init__(self, **kwargs):
        """Validate `few_shot_num` and delegate construction to the base class.

        Any `few_shot_num` other than 0 or 4 is reported via the module logger
        and replaced with 4 before the base initializer runs.
        """
        few_shot_num = kwargs.get('few_shot_num', 4)
        if few_shot_num not in (0, 4):
            logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, '
                         f'but got {few_shot_num}. Use 4-shot by default.')
            kwargs['few_shot_num'] = 4

        super().__init__(**kwargs)

    def load(self, **kwargs):
        """Load the dataset through the base class and regroup it by level.

        The caller's `subset_list` is overridden with `['default']` so that all
        levels are loaded together; the result is then passed to
        `reformat_subset` with `subset_key='level'` (defined in the base class;
        presumably it splits the samples by their `level` field).
        """
        # default load all levels
        kwargs['subset_list'] = ['default']
        data_dict = super().load(**kwargs)
        return self.reformat_subset(data_dict, subset_key='level')

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """Load the dataset from per-sample JSON files on disk.

        Args:
            dataset_name_or_path: Dataset directory. Used as-is when it exists,
                otherwise resolved relative to `work_dir`.
            subset_list: Subset names to use as top-level keys of the result.
            work_dir: Fallback root directory for `dataset_name_or_path`.
            **kwargs: Unused here; accepted for interface compatibility.

        Returns:
            A mapping `{subset_name: {split_name: [parsed_json, ...]}}` with one
            list entry per JSON file found under `<root>/<split>/*/`.
        """
        # The root does not depend on the subset or split, so resolve it once.
        if os.path.exists(dataset_name_or_path):
            dataset_root = dataset_name_or_path
        else:
            dataset_root = os.path.join(work_dir, dataset_name_or_path)

        # This benchmark is registered with train_split=None; joining None onto
        # a path raises TypeError, so unset splits are skipped.
        split_names = [split for split in (self.train_split, self.eval_split) if split is not None]

        data_dict = defaultdict(dict)
        for subset_name in subset_list:
            for split_name in split_names:
                # NOTE: without recursive=True, '**' behaves like '*', i.e. it
                # matches exactly one directory level below the split directory.
                pattern = os.path.join(dataset_root, split_name, '**', '*.json')

                split_data = []
                # glob order is filesystem-dependent; sort for reproducible loads.
                for file_path in sorted(glob.glob(pattern)):
                    if not os.path.exists(file_path):
                        continue
                    with open(file_path, 'r', encoding='utf-8') as f:
                        split_data.append(json.load(f))

                data_dict[subset_name][split_name] = split_data

        return data_dict

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the prompt for the model input.

        Args:
            input_d: raw input dict, e.g.
                {"problem": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?",
                 "level": "Level 3", "type": "Algebra",
                 "solution": "... Therefore, the graph has $\\boxed{2}$ vertical asymptotes."}
            few_shot_list: few shot list. Each item is a raw input dict.
                Not used here: the few-shot examples are fixed in `_generate_prompt`.
            **kwargs: unused.

        Returns:
            The value of `self.gen_prompt_data(full_prompt)`; per the original
            documentation this has the form {'data': [prompt]}.
        """
        use_fewshot = self.few_shot_num > 0
        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
        # Only the template is interpreted by str.format; braces inside `query`
        # are inserted verbatim.
        full_prompt = self.prompt_template.format(query=query)
        return self.gen_prompt_data(full_prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """Extract the gold answer from the `solution` field of the raw input."""
        return strip_answer_string(extract_answer(input_d['solution']))

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        """
        Parse the model output to get the answer.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d (dict): The raw input. Not used by this adapter.
            eval_type: 'checkpoint' or 'service' or `custom`. Not used: the same
                extraction is applied in every mode.

        Returns:
            The extracted and normalized answer string.
        """
        # Note: Use same extraction method for both of checkpoint/service/custom
        return strip_answer_string(extract_answer(result))

    def match(self, gold: str, pred: str) -> float:
        """Return 1.0 when `math_equal(pred, gold)` is truthy, otherwise 0.0."""
        return 1.0 if math_equal(pred, gold) else 0.0

    @classmethod
    def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
        """Build the query text for one problem.

        Args:
            input_d: raw input dict; only `input_d['problem']` is read.
            use_fewshot: when True, prepend the four fixed worked examples.

        Returns:
            'Problem:\\n<problem>\\nSolution:\\n', optionally preceded by the
            few-shot examples.
        """
        problem: str = input_d['problem']
        question = 'Problem:\n' + problem + '\nSolution:\n'

        if not use_fewshot:
            return question

        # Use 4-shot examples by system.
        # These are plain (non-f, non-format) literals, so LaTeX braces are
        # written once and backslashes are escaped explicitly. The previous
        # doubled braces were emitted verbatim into the prompt.
        # NOTE(review): the trailing '}' after the first problem statement is
        # kept from the original literal; it looks like a typo inherited from
        # the source prompt - confirm before removing.
        few_shot_examples = (
            'Problem:\nFind the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'
            'Problem:\nIf $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$\nSolution:\nWe have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
            'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{16} \\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
            'Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n'
        )
        return few_shot_examples + question