# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os
import random
import re

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import AnswerKeys
from evalscope.metrics import exact_match
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

# BBH multiple-choice subset list
MULTIPLE_CHOICE = 'multiple_choice'
MULTIPLE_CHOICE_LIST = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]

# BBH free-form subset list
FREE_FORM = 'free_form'
FREE_FORM_LIST = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

# BBH sub-task type key
TASK_TYPE = 'task_type'
SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


@Benchmark.register(
    name='bbh',
    pretty_name='BBH',
    tags=['Reasoning'],
    description=
    'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.',  # noqa: E501
    dataset_id='modelscope/bbh',
    subset_list=SUBSET_LIST,
    metric_list=['AverageAccuracy'],
    few_shot_num=3,
    train_split=None,
    eval_split='test',
    prompt_template="Q: {query}\nA: Let's think step by step.",
)
class BBHAdapter(DataAdapter):
    """
    Adapter for BBH free-form and multiple-choice sub-tasks.
    """

    def __init__(self, **kwargs):

        few_shot_num = kwargs.get('few_shot_num', 3)

        if few_shot_num != 3 and few_shot_num != 0:
            logger.error(f'BBH uses 3-shot examples with CoT or 0-shot, but got {few_shot_num}. '
                         f'Falling back to 3-shot.')
            kwargs['few_shot_num'] = 3

        super().__init__(**kwargs)

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}
        for subset_name in subset_list:
            for split_name in [self.eval_split]:
                if os.path.exists(dataset_name_or_path):
                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}.json')
                else:
                    file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
                if os.path.exists(file_path):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        examples = json.load(f)['examples']
                        if subset_name in data_dict:
                            data_dict[subset_name].update({split_name: examples})
                        else:
                            data_dict[subset_name] = {split_name: examples}

        return data_dict
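
    # Note (illustrative, not part of the upstream adapter): the loader above expects one JSON file
    # per subset, e.g. <dataset_name_or_path>/boolean_expressions.json, whose top-level object holds
    # the samples under an 'examples' key, e.g.
    #   {"examples": [{"input": "not ( True ) and ( True ) is", "target": "False"}, ...]}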

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from raw data, unifying the prompt format for the BBH benchmark
        (both multiple-choice and free-form sub-tasks).

        Args:
            input_d (dict): The raw input. A single BBH sample looks like:

                {
                    'input': '((-1 + 2 + 9 * 5) - (-2 + -4 + -4 * -7)) =',
                    'target': '24',
                }

        Returns:
            {'data': ['xxx']}
        """
        # few_shot_list: should be ['xxxx']
        if len(few_shot_list) > 0:
            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
        else:
            cot_prompts = ''
        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

        return self.gen_prompt_data(full_prompt)
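
    # Illustrative sketch (not part of the upstream adapter): with zero-shot settings, the sample
    # shown in the docstring above is rendered through `prompt_template` as
    #   "Q: ((-1 + 2 + 9 * 5) - (-2 + -4 + -4 * -7)) =\nA: Let's think step by step."
    # With few-shot, the chain-of-thought exemplars in `few_shot_list[0]` are prepended, preceded
    # by 'Follow the given examples and answer the question.'.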

    def gen_prompts(self, data_dict: dict) -> dict:
        """
        Generate dataset prompts from raw input, unify the prompt format for different datasets.

        Args:
            data_dict: Refer to the output of the load method: evalscope.benchmarks.benchmark.Benchmark.load

        Returns:
            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
            prompt_d_i (dict): refer to the output of the gen_prompt method.

            e.g. train -- few-shot data, test -- target dataset to evaluate.
        """
        res_dict: dict = {}

        if self.few_shot_num < 0:
            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

        logger.info(f'Use default settings: '
                    f'> few_shot_num: {self.few_shot_num}, '
                    f'> few_shot_split: {self.train_split}, '
                    f'> target_eval_split: {self.eval_split}')

        for sub_name, sub_data_dict in data_dict.items():
            few_shot_data = []
            if self.few_shot_num > 0:
                with open(
                        os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
                        encoding='utf-8') as f:
                    cot_prompt_str = f.read()
                few_shot_data = [cot_prompt_str]

            res_dict[sub_name] = []
            for sample_d in sub_data_dict[self.eval_split]:
                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=few_shot_data)
                sample_d_new = sample_d.copy()
                if sub_name in MULTIPLE_CHOICE_LIST:
                    sample_d_new[TASK_TYPE] = MULTIPLE_CHOICE
                elif sub_name in FREE_FORM_LIST:
                    sample_d_new[TASK_TYPE] = FREE_FORM
                else:
                    raise ValueError(f'Invalid subset name: {sub_name}')

                prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
                res_dict[sub_name].append(prompt_d)

        return res_dict
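
    # Note (illustrative): the 3-shot chain-of-thought exemplars are read from plain-text files that
    # ship alongside this adapter, e.g. cot_prompts/multistep_arithmetic_two.txt, one file per BBH
    # subset; each file is prepended verbatim to every evaluation prompt of that subset.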

    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold answer
        gold = input_d.get('target', '')
        if gold is None:
            logger.error('BBHAdapter: gold is None.')
            gold = ''
        # Remove brackets, e.g. '(A)' -> 'A'
        gold = gold.replace('(', '').replace(')', '')
        return gold

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d (dict): The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom', default: 'checkpoint'

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        # Note: the same extraction method is used for checkpoint/service/custom.
        task_type: str = raw_input_d.get(TASK_TYPE)

        if task_type == MULTIPLE_CHOICE:
            return self._extract_mc_answer(result)
        elif task_type == FREE_FORM:
            return self._extract_ff_answer(result)
        else:
            raise ValueError(f'Invalid task type: {task_type}')

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)
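
    # Note on the `match` scoring above (illustrative, assuming evalscope's exact_match scores 1.0
    # on exact string equality and 0.0 otherwise):
    #   match(gold='24', pred='24') -> 1.0, match(gold='B', pred='A') -> 0.0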

    @classmethod
    def _extract_mc_answer(cls, ans: str) -> str:
        """
        Extract the answer from the model output for multiple-choice tasks.
        """
        ans_line = ans.split('answer is ')
        if len(ans_line) != 1:
            ans = ans_line[1].strip()
        match = re.search(r'\(([A-Z])\)*', ans)
        if match:
            return match.group(1)
        match = re.search(r'([A-Z])', ans)
        if match:
            return match.group(1)
        return ans
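
    # Illustrative behaviour of _extract_mc_answer (not part of the upstream adapter):
    #   "So the answer is (B)." -> 'B'
    #   "The answer is A."      -> 'A'  (falls back to the first capital letter found)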

    @classmethod
    def _extract_ff_answer(cls, ans: str) -> str:
        """
        Extract the answer from the model output for free-form tasks.
        """
        pattern = r'answer is\s+(.*?)\.'

        match = re.search(pattern, ans)
        if match:
            res = match.group(1)
            return res

        ans_line = ans.split('answer is ')
        if len(ans_line) != 1:
            ans = ans_line[1].strip()
        ans = ans.split('\n')[0]
        if ans.endswith('.'):
            ans = ans[:-1]
        return ans
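

# Minimal smoke test (illustrative only, not part of the upstream adapter): the two answer
# extractors are classmethods, so they can be exercised without loading a dataset or passing
# benchmark kwargs to the adapter.
if __name__ == '__main__':
    mc_output = "Let's think step by step. ... So the answer is (B)."
    ff_output = "Let's think step by step. 45 - 21 = 24. So the answer is 24."

    assert BBHAdapter._extract_mc_answer(mc_output) == 'B'
    assert BBHAdapter._extract_ff_answer(ff_output) == '24'
    print('BBH answer extraction smoke test passed.')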