evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/winogrande/winogrande_adapter.py

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser


@Benchmark.register(
    name='winogrande',
    pretty_name='Winogrande',
    tags=['Reasoning', 'MCQ'],
    description=
    'Winogrande is a benchmark for evaluating AI models on commonsense reasoning tasks, specifically designed to test the ability to resolve ambiguous pronouns in sentences.',  # noqa: E501
    dataset_id='AI-ModelScope/winogrande_val',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='validation',
    prompt_template='Question: {query}\nA. {option1}\nB. {option2}\nAnswer:',  # noqa: E501
)
class WinograndeAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The two candidate completions map onto the choice letters 'A' and 'B'.
        self.choices = ['A', 'B']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        # Each Winogrande item provides a sentence with a blank plus two candidate
        # fillers ('option1'/'option2'); format them into the A/B choice template.
        prompt = self.prompt_template.format(
            query=input_d['sentence'],
            option1=input_d['option1'],
            option2=input_d['option2'],
        )
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        # The gold label is stored as '1' or '2'; convert it to a 0-based index
        # and return the corresponding choice letter ('A' or 'B').
        answer_index = int(input_d['answer']) - 1
        return self.choices[answer_index]

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            # Multiple-choice adapters already yield a bare choice letter; use it as-is.
            return result
        else:
            # For free-form generation, extract the first valid choice letter from the response.
            return ResponseParser.parse_first_option_with_choices(result, self.choices)

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        return exact_match(gold=gold, pred=pred)
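

# --- Example usage (illustrative sketch, not part of the original module) ---
# Assuming evalscope's top-level `TaskConfig`/`run_task` API is available (verify the
# names against your installed version), the registered 'winogrande' benchmark can be
# run against any supported model; the model id below is a placeholder:
#
#     from evalscope import TaskConfig, run_task
#
#     task_cfg = TaskConfig(
#         model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
#         datasets=['winogrande'],           # resolves to this adapter via the registry
#         limit=10,                          # small sample for a quick smoke test
#     )
#     run_task(task_cfg=task_cfg)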