from collections import defaultdict
from typing import Any, Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import Metric, mean, metric_registry

@Benchmark.register(
    name='ifeval',
    pretty_name='IFEval',
    tags=['Instruction-Following'],
    description=
    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
    dataset_id='opencompass/ifeval',
    subset_list=['default'],
    metric_list=[
        'prompt_level_strict_acc',
        'inst_level_strict_acc',
        'prompt_level_loose_acc',
        'inst_level_loose_acc',
    ],
    few_shot_num=0,
    train_split=None,
    eval_split='train',
    prompt_template='',
)
class IFEvalAdapter(DataAdapter):
    """Data adapter for the IFEval instruction-following benchmark."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # register the IFEval accuracy metrics; each one is aggregated with a simple mean
        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # IFEval is zero-shot (few_shot_num=0), so only the raw 'prompt' field of the sample is used
        return self.gen_prompt_data(input_d['prompt'])

    def get_gold_answer(self, input_d: dict) -> dict:
        # the whole sample dict is the gold reference: the IFEval scorer needs its
        # instruction metadata, not a single answer string
        return input_d
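
    # Sketch of what a raw sample is assumed to look like (schema of the
    # opencompass/ifeval dataset, mirroring the original IFEval release; the
    # exact field names are an assumption, they are not defined in this file):
    #
    #   {
    #       'prompt': 'Write a 300+ word summary ...',
    #       'instruction_id_list': ['punctuation:no_comma', 'length_constraints:number_words'],
    #       'kwargs': [{}, {'relation': 'at least', 'num_words': 300}],
    #   }
    #
    # The whole record passes through get_gold_answer() so that match() can hand
    # it to the IFEval instruction checkers.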

    def match(self, gold: Any, pred: Any) -> Dict:
        from evalscope.benchmarks.ifeval.utils import process_results

        # score the prediction against the sample's instructions; the per-sample
        # result is keyed by the metric names registered above
        return process_results(gold, [pred])

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
        # aggregate the per-sample review results by metric name, then reduce them
        # with the registered metrics
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
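

# Usage sketch (an assumption, not part of this adapter): once registered under the
# name 'ifeval', the benchmark is typically selected through evalscope's task runner.
# The model id below is a placeholder.
#
#   from evalscope import TaskConfig, run_task
#
#   task_cfg = TaskConfig(
#       model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
#       datasets=['ifeval'],
#   )
#   run_task(task_cfg=task_cfg)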