evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/aime/aime25_adapter.py

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import extract_answer, math_equal, strip_answer_string
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()


@Benchmark.register(
    name='aime25',
    pretty_name='AIME-2025',
    tags=['Mathematics'],
    description=
    'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
    dataset_id='opencompass/AIME2025',
    subset_list=['AIME2025-I', 'AIME2025-II'],
    metric_list=['AveragePass@1'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',  # Evaluation uses the 'test' split; no train split is configured
    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
)
class AIME25Adapter(DataAdapter):
    """
    Data adapter for the AIME-2025 benchmark, registered under the name 'aime25'.

    Builds a zero-shot prompt from each sample's 'question' field, extracts the
    model's final answer from its output, and scores it against the sample's
    'answer' field using `math_equal`.
    """

    def __init__(self, *args, **kwargs):
        # No adapter-specific state: every argument is forwarded to the base class.
        super().__init__(*args, **kwargs)

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Build the model input for a single sample.

        Args:
            input_d: The raw sample; its 'question' field is read.
            few_shot_list: Not used by this adapter.

        Returns:
            The result of `gen_prompt_data` applied to the rendered prompt.
        """
        rendered_prompt = self.prompt_template.format(query=input_d['question'])
        return self.gen_prompt_data(rendered_prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Return the sample's 'answer' field after `strip_answer_string` normalization.
        """
        raw_answer = input_d['answer']
        return strip_answer_string(raw_answer)

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        """
        Pull the final answer out of the raw model output.

        The same extraction is applied for every `eval_type`
        (checkpoint/service/custom); `raw_input_d` is not consulted.
        """
        extracted = extract_answer(result)
        return strip_answer_string(extracted)

    def match(self, gold: str, pred: str) -> float:
        """
        Score a prediction: 1.0 when `math_equal(pred, gold)` is truthy, else 0.0.
        """
        if math_equal(pred, gold):
            return 1.0
        return 0.0