evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/ifeval/ifeval_adapter.py


from collections import defaultdict
from typing import Any, Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='ifeval',
    pretty_name='IFEval',
    tags=['Instruction-Following'],
    description='IFEval is a benchmark for evaluating instruction-following language models, '
    'focusing on their ability to understand and respond to various prompts. It includes a diverse '
    'set of tasks and metrics to assess model performance comprehensively.',
    dataset_id='opencompass/ifeval',
    subset_list=['default'],
    metric_list=[
        'prompt_level_strict_acc',
        'inst_level_strict_acc',
        'prompt_level_loose_acc',
        'inst_level_loose_acc',
    ],
    few_shot_num=0,
    train_split=None,
    eval_split='train',
    prompt_template='',
)
class IFEvalAdapter(DataAdapter):
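    """Data adapter for the IFEval instruction-following benchmark."""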

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # register metrics
        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # IFEval is zero-shot (few_shot_num=0); the sample's raw prompt is used as-is
        return self.gen_prompt_data(input_d['prompt'])

    def get_gold_answer(self, input_d: dict) -> dict:
        # the full sample dict serves as the gold reference: `match` needs more than
        # a single answer string, so return it unchanged
        return input_d

    def match(self, gold: Any, pred: Any) -> Dict:
        # imported locally so the IFEval scoring utilities are only loaded when matching runs
        from evalscope.benchmarks.ifeval.utils import process_results

        # score the single prediction against the gold sample; the result is a dict
        # keyed by the four metric names registered in __init__
        return process_results(gold, [pred])

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
        # aggregate the per-sample review dicts into per-metric lists, then reduce
        # each metric with its registered aggregation function (mean)
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
        return super().compute_metric(res_dict, **kwargs)
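

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Assuming the
# package's top-level `TaskConfig` / `run_task` entry points and a placeholder
# model name, the registered 'ifeval' benchmark could be run like this:
#
#     from evalscope import TaskConfig, run_task
#
#     task_cfg = TaskConfig(
#         model='qwen2.5-7b-instruct',  # placeholder model identifier
#         datasets=['ifeval'],          # selects IFEvalAdapter via its registered name
#         limit=10,                     # small sample for a quick smoke test
#     )
#     run_task(task_cfg=task_cfg)
# ---------------------------------------------------------------------------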