evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/arc/arc_adapter.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser
from evalscope.utils.logger import get_logger
# flake8: noqa

logger = get_logger()


@Benchmark.register(
    name='arc',
    pretty_name='ARC',
    tags=['Reasoning', 'MCQ'],
    description=
    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
    dataset_id='modelscope/ai2_arc',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template=
    'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.',  # noqa
)
class ARCAdapter(DataAdapter):

    def __init__(self, **kwargs):
        few_shot_num = kwargs.get('few_shot_num', None)
        if few_shot_num is None:
            # Use 0-shot by default
            logger.info('Set 0-shot examples by default for ARC.')
            few_shot_num = 0

        if few_shot_num != 0:
            logger.warning(f'few_shot_num is recommended to be 0 for ARC, got {few_shot_num}.')

        super().__init__(**kwargs)

        self.choices = ['A', 'B', 'C', 'D']

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.

        Args:
            dataset_name_or_path: str, the dataset id or path. e.g. 'arc'
            subset_list: list, the subset list to load. e.g. ['ARC-Easy', 'ARC-Challenge']
            work_dir: str, the local root data directory. e.g. '/path/to/data'
            kwargs: dict, other arguments.
        """
        data_dict = {}
        for subset_name in subset_list:
            if os.path.exists(dataset_name_or_path):
                subset_path = os.path.join(dataset_name_or_path, subset_name)
            else:
                subset_path = os.path.join(work_dir, dataset_name_or_path, subset_name)

            for split_name in ['Train', 'Test']:
                split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
                if os.path.exists(split_path):
                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
                        rows = []
                        for line in in_f:
                            item = json.loads(line.strip())
                            raw_choices = item['question']['choices']
                            rows.append({
                                'id': item['id'],
                                'question': item['question']['stem'],
                                'choices': {
                                    'text': [d['text'] for d in raw_choices],
                                    'label': [d['label'] for d in raw_choices]
                                },
                                'answerKey': item['answerKey'],
                            })

                        if subset_name in data_dict:
                            data_dict[subset_name].update({split_name.lower(): rows})
                        else:
                            data_dict[subset_name] = {split_name.lower(): rows}

        return data_dict

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from raw data, unifying the prompt format for the ARC benchmark.

        Args:
            input_d (dict): The raw input. A single ARC record looks like:
                {
                    'id': 'Mercury_7220990',
                    'question': 'Which factor will most likely cause a person to develop a fever?',
                    'choices': {
                        'text': ['a leg muscle relaxing after exercise',
                                 'a bacterial population in the bloodstream',
                                 'several viral particles on the skin',
                                 'carbohydrates being digested in the stomach'],
                        'label': ['A', 'B', 'C', 'D']
                    },
                    'answerKey': 'B'
                }
            few_shot_list (list): Few-shot examples in the same format as input_d.

        Returns:
            {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
        """
        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)
        full_prompt = self.prompt_template.format(query=context)

        return self.gen_prompt_data(full_prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
        return input_d.get('answerKey', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d (dict): The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom'. Default: 'checkpoint'.

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return ResponseParser.parse_first_option(text=result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

    @classmethod
    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
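        # Builds the per-question block; using the sample record from gen_prompt's docstring it would read:
        #   Which factor will most likely cause a person to develop a fever?
        #   A. a leg muscle relaxing after exercise
        #   B. a bacterial population in the bloodstream
        #   C. several viral particles on the skin
        #   D. carbohydrates being digested in the stomach
        #   Answer: B
        # The trailing 'Answer: <label>' line is appended only for few-shot examples (include_answer=True).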
        example: str = input_d['question']
        choices_texts: list = input_d['choices']['text']
        choices_labels: list = input_d['choices']['label']
        choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
        example += '\n' + choices_prompts

        if include_answer:
            example += '\nAnswer:'
            example += ' {}\n\n'.format(input_d['answerKey'])

        return example
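
# Note: within the DataAdapter workflow, gen_prompt builds the prompt (optionally with few-shot
# examples), the model's response is passed to parse_pred_result to recover a choice letter,
# get_gold_answer supplies the reference label, and match scores the pair with exact_match.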