from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser

@Benchmark.register(
    name='iquiz',
    pretty_name='IQuiz',
    tags=['Knowledge', 'MCQ', 'Chinese'],
    description='IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
    dataset_id='AI-ModelScope/IQuiz',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['IQ', 'EQ'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    # System prompt (zh): 'You are an expert with high IQ and EQ. You are asked to answer
    # a multiple-choice question: pick the correct option, explain why, and end with the
    # format `答案是(选项)` ("The answer is (option)").'
    system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
)
class IQuizAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Candidate option labels; each question uses up to five choices (A-E).
        self.choices = ['A', 'B', 'C', 'D', 'E']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from an input record.

        Example record:
        {
            "question": "天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
            "choices": ["星期一", "星期二", "星期三", "星期四"],
            "answer": "D",
            "level": 1
        }
        (Question: "The forecast said it would rain on Wednesday this week, and it did
        rain yesterday. What day is it today?"; choices: Monday through Thursday.)
        """
        prompt = f"问题: {input_d['question']}\n"  # "问题" = "Question"
        prompt += self.__form_options(input_d['choices'])
        return self.gen_prompt_data(prompt)
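
    # For the example record above, the prompt string built here looks roughly like:
    #
    #   问题: 天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?
    #   选项:
    #   (A): 星期一
    #   (B): 星期二
    #   (C): 星期三
    #   (D): 星期四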

    def __form_options(self, options: list) -> str:
        """Render the choices as a labelled option block, e.g. '(A): 星期一'."""
        option_str = '选项:\n'  # "选项" = "Options"
        for opt, choice in zip(options, self.choices):
            option_str += f'({choice}): {opt}\n'
        return option_str

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract the answer option.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            # The multiple-choice adapter already yields a bare option label.
            return result
        else:
            # Free-form generation: extract the first option letter found in the response.
            return ResponseParser.parse_first_option_with_choices(result, self.choices)
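
    # E.g. a completion that ends with '答案是(D)' ("The answer is (D)"), as the system
    # prompt requests, would be expected to parse to 'D'; the exact extraction rules
    # live in ResponseParser.parse_first_option_with_choices.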

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        return exact_match(gold=gold, pred=pred)
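

if __name__ == '__main__':
    # Minimal usage sketch, assuming evalscope's documented `TaskConfig`/`run_task`
    # entry points; the model id below is a placeholder for illustration and is not
    # part of this adapter.
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',
        datasets=['iquiz'],  # resolved to IQuizAdapter via the registered name above
    )
    run_task(task_cfg=task_cfg)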