# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()


@Benchmark.register(
    name='arc',
    pretty_name='ARC',
    tags=['Reasoning', 'MCQ'],
    description=
    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
    dataset_id='modelscope/ai2_arc',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template=
    'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.',  # noqa
)
class ARCAdapter(DataAdapter):

    def __init__(self, **kwargs):
        few_shot_num = kwargs.get('few_shot_num', None)
        if few_shot_num is None:
            # Use 0-shot by default
            logger.info('Set 0-shot examples by system for ARC.')
            few_shot_num = 0

        if few_shot_num != 0:
            logger.warning(f'few_shot_num is recommended to be set to 0 for ARC, got {few_shot_num}.')

        super().__init__(**kwargs)

        self.choices = ['A', 'B', 'C', 'D']

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.

        Args:
            dataset_name_or_path: str, the dataset id or path. e.g. 'arc'
            subset_list: list, the subsets to load. e.g. ['ARC-Easy', 'ARC-Challenge']
            work_dir: str, the local root data directory. e.g. '/path/to/data'
            kwargs: dict, other arguments.
        """
        data_dict = {}
        for subset_name in subset_list:
            if os.path.exists(dataset_name_or_path):
                subset_path = os.path.join(dataset_name_or_path, subset_name)
            else:
                subset_path = os.path.join(work_dir, dataset_name_or_path, subset_name)
            for split_name in ['Train', 'Test']:
                split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
                if os.path.exists(split_path):
                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
                        rows = []
                        for line in in_f:
                            item = json.loads(line.strip())
                            raw_choices = item['question']['choices']
                            rows.append({
                                'id': item['id'],
                                'question': item['question']['stem'],
                                'choices': {
                                    'text': [d['text'] for d in raw_choices],
                                    'label': [d['label'] for d in raw_choices]
                                },
                                'answerKey': item['answerKey'],
                            })

                        if subset_name in data_dict:
                            data_dict[subset_name].update({split_name.lower(): rows})
                        else:
                            data_dict[subset_name] = {split_name.lower(): rows}

        return data_dict
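
    # On-disk layout implied by the loader above (an illustrative sketch; the actual
    # files depend on how the dataset was downloaded or unpacked locally):
    #
    #   {work_dir}/{dataset_name_or_path}/
    #       ARC-Easy/ARC-Easy-Train.jsonl
    #       ARC-Easy/ARC-Easy-Test.jsonl
    #       ARC-Challenge/ARC-Challenge-Train.jsonl
    #       ARC-Challenge/ARC-Challenge-Test.jsonl
    #
    # Each JSONL line is a raw ARC record with 'id', 'question' ('stem' + 'choices')
    # and 'answerKey'; load_from_disk flattens it into the row format used below.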

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from raw data, unifying the prompt format for the ARC benchmark.

        Args:
            input_d (dict): The raw input. A single ARC record, e.g.:

                {
                    'id': 'Mercury_7220990',
                    'question': 'Which factor will most likely cause a person to develop a fever?',
                    'choices': {
                        'text': ['a leg muscle relaxing after exercise',
                                 'a bacterial population in the bloodstream',
                                 'several viral particles on the skin',
                                 'carbohydrates being digested in the stomach'],
                        'label': ['A', 'B', 'C', 'D']
                    },
                    'answerKey': 'B'
                }

        Returns:
            {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
        """
        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)

        full_prompt = self.prompt_template.format(query=context)

        return self.gen_prompt_data(full_prompt)
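
    # Illustrative rendering (zero-shot, i.e. few_shot_list == []): for the sample
    # record in the docstring above, `context` is the question followed by the
    # lettered choices, and `full_prompt` becomes roughly:
    #
    #   Given the following question and four candidate answers (A, B, C and D), choose the best answer.
    #   Which factor will most likely cause a person to develop a fever?
    #   A. a leg muscle relaxing after exercise
    #   B. a bacterial population in the bloodstream
    #   C. several viral particles on the skin
    #   D. carbohydrates being digested in the stomach
    #   Your response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.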

    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
        return input_d.get('answerKey', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d (dict): The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom', default: 'checkpoint'

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return ResponseParser.parse_first_option(text=result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)
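
    # Illustrative expectation (a sketch of intent, not a guarantee of ResponseParser
    # internals): for a generation-style completion such as
    #   "... The best answer is B"
    # parse_first_option(text=result, options=['A', 'B', 'C', 'D']) is expected to
    # return 'B', which match() then compares against the gold 'answerKey' via exact_match.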

    @classmethod
    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

        example: str = input_d['question']

        choices_texts: list = input_d['choices']['text']
        choices_labels: list = input_d['choices']['label']
        choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
        example += '\n' + choices_prompts

        if include_answer:
            example += '\nAnswer:'
            example += ' {}\n\n'.format(input_d['answerKey'])

        return example