from typing import Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='tool_bench',
    pretty_name='ToolBench-Static',
    tags=['Reasoning', 'Agent'],
    description='ToolBench is a benchmark for evaluating AI models on tool-use tasks. '
    'It includes in-domain and out-of-domain subsets, '
    'each with problems that require step-by-step reasoning to arrive at the correct answer. '
    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',  # noqa: E501
    dataset_id='AI-ModelScope/ToolBench-Static',
    subset_list=['in_domain', 'out_of_domain'],
    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ToolBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Register every metric reported by this benchmark; the raw scores
        # are produced by `.utils.calculate_metrics` (see `match` below) and
        # each one is aggregated across samples with a simple mean.
        metric_registry.register(Metric(name='Rouge-L', object=mean))
        metric_registry.register(Metric(name='Act.EM', object=mean))
        metric_registry.register(Metric(name='Plan.EM', object=mean))
        metric_registry.register(Metric(name='F1', object=mean))
        metric_registry.register(Metric(name='HalluRate', object=mean))

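    # For illustration, a raw ToolBench-Static record is assumed to look
    # roughly like this. The top-level keys ('messages', 'target', 'tools')
    # are taken from the adapter code below; the nested message shape is an
    # assumption, not confirmed by the dataset:
    #
    #     {
    #         'messages': [{'role': 'user', 'content': '...', 'name': '...'}, ...],
    #         'target': '<gold action / response string>',
    #         'tools': [<tool schema dicts available to the model>],
    #     }
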
    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from the input data.
        """
        messages = input_d['messages']
        # Use the prepared messages, dropping the 'name' field so the
        # message list is compatible with standard chat-completion APIs.
        for message in messages:
            if 'name' in message:
                del message['name']
        return self.gen_prompt_data(prompt='', messages=messages)

    def get_gold_answer(self, input_d: dict) -> dict:
        """
        Parse the raw input labels (gold). The full record is returned,
        since `match` needs both the target and the tool list.
        """
        return input_d

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract the proper answer.
        """
        return result

    def match(self, gold: dict, pred: str) -> Dict:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import calculate_metrics

        data = {
            'target': gold['target'],
            'predictions': pred,
            'tools': gold['tools'],
        }
        metrics = calculate_metrics(data)
        return metrics

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
        # Aggregate the per-sample metric dicts into per-metric score lists,
        # then reduce each list with its registered aggregation function (mean).
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
        return super().compute_metric(res_dict, **kwargs)
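

# Minimal usage sketch (an assumption based on evalscope's public
# `TaskConfig`/`run_task` API; the model id and batch size below are
# illustrative placeholders, not part of this adapter):
#
#     from evalscope import TaskConfig, run_task
#
#     task_cfg = TaskConfig(
#         model='qwen2.5-7b-instruct',  # hypothetical model id
#         datasets=['tool_bench'],      # the name registered above
#         eval_batch_size=4,
#     )
#     run_task(task_cfg=task_cfg)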