# evalscope v0.17.0: evalscope/benchmarks/tool_bench/tool_bench_adapter.py


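"""Data adapter for the ToolBench-Static benchmark.

Registers ToolBench-Static with evalscope and implements prompt
construction, gold-answer extraction, prediction parsing, per-sample
matching, and metric aggregation for the benchmark.
"""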
from typing import Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='tool_bench',
    pretty_name='ToolBench-Static',
    tags=['Reasoning', 'Agent'],
    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
    'It includes various subsets such as in-domain and out-of-domain, '
    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',  # noqa: E501
    dataset_id='AI-ModelScope/ToolBench-Static',
    subset_list=['in_domain', 'out_of_domain'],
    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ToolBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Register each metric with a mean aggregator so that per-sample
        # scores are averaged over the evaluation set.
        metric_registry.register(Metric(name='Rouge-L', object=mean))
        metric_registry.register(Metric(name='Act.EM', object=mean))
        metric_registry.register(Metric(name='Plan.EM', object=mean))
        metric_registry.register(Metric(name='F1', object=mean))
        metric_registry.register(Metric(name='HalluRate', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from the input data.
        """
        messages = input_d['messages']
        # Use the prepared chat messages as-is, stripping the optional
        # 'name' field from each message.
        for message in messages:
            if 'name' in message:
                del message['name']
        return self.gen_prompt_data(prompt='', messages=messages)

    def get_gold_answer(self, input_d: dict) -> dict:
        """
        Parse the raw input labels (gold). The whole record is returned,
        since `match` needs both the target and the tool definitions.
        """
        return input_d

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract the proper answer.
        The raw model output is returned unchanged; scoring happens in `match`.
        """
        return result

    def match(self, gold: dict, pred: str) -> Dict:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import calculate_metrics

        # `calculate_metrics` scores a single sample against the gold target
        # and tool definitions, returning one value per registered metric.
        data = {
            'target': gold['target'],
            'predictions': pred,
            'tools': gold['tools'],
        }
        return calculate_metrics(data)

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
        # Group the per-sample score dicts into per-metric lists, then reduce
        # each list with its registered aggregator (mean).
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
        return super().compute_metric(res_dict, **kwargs)
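

# Usage sketch (not part of the original module): one way to run this
# benchmark through evalscope's `run_task` entry point, following the
# pattern in the linked usage example. The model name, endpoint, and API
# key below are placeholders; adjust them for your own deployment.
if __name__ == '__main__':
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='qwen-plus',  # placeholder: any OpenAI-compatible model name
        eval_type='service',  # evaluate a served API rather than a local checkpoint
        api_url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder endpoint
        api_key='EMPTY',  # placeholder key
        datasets=['tool_bench'],
        dataset_args={'tool_bench': {'subset_list': ['in_domain', 'out_of_domain']}},
    )
    run_task(task_cfg=task_cfg)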