import os
import re
from typing import Any, List, Optional

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import AnswerKeys, EvalType
from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score

cur_path = os.path.dirname(os.path.abspath(__file__))


@Benchmark.register(
    name='process_bench',
    pretty_name='ProcessBench',
    tags=['Mathematical', 'Reasoning'],
    description='ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. '
    'It includes various subsets such as GSM8K, MATH, OlympiadBench, and OmniMath, each with its own set '
    'of problems that require step-by-step reasoning to arrive at the correct answer.',
    dataset_id='Qwen/ProcessBench',
    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ProcessBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Load the critique prompt template used to build evaluation prompts.
        with open(os.path.join(cur_path, 'critique_template.txt'), encoding='utf-8') as f:
            self.prompt_template = f.read()

        # Register metrics: accuracy on erroneous samples, accuracy on fully
        # correct samples, and an F1 score combining the two.
        metric_registry.register(Metric(name='error_acc', object=mean))
        metric_registry.register(Metric(name='correct_acc', object=mean))
        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))

    def load(self, **kwargs):
        # Each subset is stored as a separate split in the dataset,
        # so load splits as subsets.
        kwargs['split_as_subset'] = True
        data_dict = super().load(**kwargs)
        return data_dict

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        problem = input_d['problem']
        steps = input_d['steps']

        # Wrap each solution step in <paragraph_i> tags so the model can
        # refer to steps by index in its critique.
        tagged_response = ''
        for sdx, step in enumerate(steps):
            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
        tagged_response = tagged_response.strip()

        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)

        return self.gen_prompt_data(full_prompt)

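    # Illustrative shape of the tagged response built above, for a
    # hypothetical two-step solution:
    #
    #   <paragraph_0>
    #   First solution step...
    #   </paragraph_0>
    #
    #   <paragraph_1>
    #   Second solution step...
    #   </paragraph_1>
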
    def get_gold_answer(self, input_d: dict) -> int:
        """
        Parse the raw input labels (gold): the index of the first erroneous
        step, or -1 if all steps are correct.
        """
        return int(input_d['label'])

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Optional[int]:
        """
        Parse the predicted result and extract the proper answer: the step
        index inside the last \\boxed{...}, or None if it cannot be parsed.
        """
        pred = ProcessBenchAdapter.extract_answer(result)
        try:
            pred = int(pred)
        except Exception:
            pred = None
        return pred

    def match(self, gold: int, pred: int) -> float:
        """
        Match the gold answer and the predicted answer: 1.0 on an exact
        match, 0.0 otherwise.
        """
        return float(gold == pred)

    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
        reviews_list = kwargs['reviews_list']
        # Split results by gold label: -1 marks a fully correct solution;
        # any other label is the index of the first erroneous step.
        error_data = []
        correct_data = []
        for res, raw in zip(review_res_list, reviews_list):
            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
                correct_data.append(res)
            else:
                error_data.append(res)
        data = {}
        if len(correct_data) != 0:
            data.update({'correct_acc': correct_data})
        if len(error_data) != 0:
            data.update({'error_acc': error_data})
        # simple_f1_score takes both groups and combines their accuracies.
        data.update({'simple_f1_score': (correct_data, error_data)})
        return super().compute_metric(data)

    @staticmethod
    def extract_answer(solution_text: str) -> Optional[str]:
        # Return the content of the last \boxed{...} in the text, if any.
        boxed_pattern = r'\\boxed\{([^}]*)\}'
        matches = re.findall(boxed_pattern, solution_text)
        if matches:
            return matches[-1].strip()
        return None
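
# Minimal usage sketch (not part of the adapter; the sample text is
# hypothetical): extract_answer pulls the step index from the model's final
# \boxed{...} answer, which parse_pred_result then converts to an int.
if __name__ == '__main__':
    sample = 'The first mistake appears in step 2, so the answer is \\boxed{2}.'
    assert ProcessBenchAdapter.extract_answer(sample) == '2'
    assert ProcessBenchAdapter.extract_answer('no boxed answer here') is None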