evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/tool_bench/utils.py

import json

from evalscope.metrics import compute_rouge_score_one_sample


def evaluate_rougel(cand_list: list, ref_list: list):
    """Return the Rouge-L F-measure of the candidate answers against the references."""
    if len(ref_list) == 0:
        return 0
    rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
    rougel = rouge_score.get('rouge-l-f', 0)
    return rougel


def evaluate_action_em(cand_list: list, ref_list: list):
    """Return the exact-match rate between predicted and reference actions.

    The two lists are expected to have the same length.
    """
    if len(ref_list) == 0:
        return 0
    em = 0
    for cand, ref in zip(cand_list, ref_list):
        em += (1 if cand == ref else 0)
    return em / len(cand_list)
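

# Worked example (illustrative, not part of the original module):
# evaluate_action_em(['a', 'b'], ['a', 'c']) returns 0.5, since one of the
# two positions matches exactly.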


def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
    """Score predicted action inputs (JSON strings) against references.

    A reference key counts as a full match when the predicted value equals the
    reference value, and as a half match (weight 0.5) when the key is present
    with a different value.
    """
    easy_f1 = []
    hard_f1 = []
    f1 = []
    for i in range(len(action_pred)):
        ref_action = action_ref[i]
        pred_action = action_pred[i]
        ref_input = ref_list[i]
        cand_input = cand_list[i]
        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
        else:
            try:
                ref_input_json = json.loads(ref_input)
                try:
                    cand_input_json = json.loads(cand_input)
                    half_match = 0
                    full_match = 0
                    if ref_input_json == {}:
                        # Empty reference input: the prediction must also be empty.
                        if cand_input_json == {}:
                            easy_f1.append(1)
                            f1.append(1)
                        else:
                            easy_f1.append(0)
                            f1.append(0)
                    else:
                        for k, v in ref_input_json.items():
                            if k in cand_input_json:
                                if cand_input_json[k] == v:
                                    full_match += 1
                                else:
                                    half_match += 1
                        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
                        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
                        if recall + precision == 0:
                            # No overlapping keys at all.
                            hard_f1.append(0)
                            f1.append(0)
                        else:
                            hard_f1.append((2 * recall * precision) / (recall + precision))
                            f1.append((2 * recall * precision) / (recall + precision))
                except Exception:
                    # The predicted input is not valid JSON.
                    if ref_input_json == {}:
                        easy_f1.append(0)
                    else:
                        hard_f1.append(0)
                    f1.append(0)
            except Exception:
                # The reference input is not valid JSON; skip the sample.
                pass
    # Return 0 for any average whose list is empty.
    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
    f1_avg = sum(f1) / len(f1) if f1 else 0
    return easy_f1_avg, hard_f1_avg, f1_avg
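

# Worked example (illustrative, not part of the original module): with matching
# actions, ref_input '{"a": 1, "b": 2}' and cand_input '{"a": 1, "b": 3}' give
# full_match = 1 ("a") and half_match = 1 ("b" is present with a wrong value),
# so recall = precision = (0.5 * 1 + 1) / 2 = 0.75 and the hard F1 is 0.75.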


def parse_action(text):
    """Extract the last 'Action:' and 'Action Input:' fields from a model response."""
    action_input = '{}'
    if 'Action Input:' in text:
        input_idx = text.rindex('Action Input:')
        action_input = text[input_idx + len('Action Input:'):].strip()
    if 'Action:' in text:
        action_idx = text.rindex('Action:')
        action = text[action_idx + len('Action:'):].strip()
        # Drop the 'Action Input:' section if it trails the action name.
        if 'Action Input:' in action:
            input_idx = action.index('Action Input:')
            action = action[:input_idx].strip()
    else:
        action = 'none'
    return action, action_input
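

# Example (illustrative): parse_action('Action: spott\nAction Input: {"is_id": "city center"}')
# returns ('spott', '{"is_id": "city center"}'); the values mirror the sample
# reference shown in calculate_metrics below.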


def parse_output(text):
    """Classify a response as a 'finish', 'give up', or 'call' plan and return its parts."""
    action, action_input = parse_action(text)
    if action == 'Finish':
        try:
            action_input = json.loads(action_input)
            return_type = action_input['return_type']
            if return_type == 'give_answer':
                if 'final_answer' in action_input:
                    answer = str(action_input['final_answer'])
                    # Treat empty or punctuation-only answers as missing.
                    if answer.strip() in ['', '.', ',']:
                        answer = 'None'
                else:
                    answer = 'None'
                return 'finish', action, action_input, answer
            else:
                return 'give up', None, None, None
        except Exception:
            return 'give up', None, None, None
    else:
        plan = 'call'
        answer = None
        return plan, action, action_input, answer
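

# Example (illustrative): a response ending in
#   Action: Finish
#   Action Input: {"return_type": "give_answer", "final_answer": "42"}
# yields ('finish', 'Finish', {...}, '42'); any other action yields a 'call'
# plan with the raw action-input string.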


def calculate_metrics(data):
    """Calculate the ToolBench metrics for a single sample.

    `data` must carry a 'target' reference string, a 'predictions' string and a
    'tools' list of dicts with a 'name' key.
    """
    plan_ref = []
    plan_pred = []
    hallu_cases = []
    answer_ref = []
    action_ref = []
    action_input_ref = []
    answer_pred = []
    action_pred = []
    action_input_pred = []
    hallu_pred = 0

    reference = data['target']
    prediction = data['predictions']
    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
    # Example reference parse:
    #   ref_plan: call
    #   ref_action: spott
    #   ref_input: {"is_id": "city center"}
    #   ref_ans: None
    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)

    # Skip samples whose reference action is itself a hallucination.
    if ref_action == 'invalid_hallucination_function_name':
        return {}
    # Skip samples whose reference action is not among the listed tools.
    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
        return {}
    # A predicted action outside the tool list counts as a hallucination.
    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
        hallu_pred += 1
        hallu_cases.append(data)

    plan_ref.append(ref_plan)
    plan_pred.append(pred_plan)
    if ref_plan == 'give up':
        pass
    elif ref_plan == 'finish':
        answer_ref.append(ref_ans)
        answer_pred.append('none' if pred_ans is None else pred_ans)
    else:
        action_ref.append(ref_action)
        action_input_ref.append(ref_input)
        action_pred.append('none' if pred_action is None else pred_action)
        action_input_pred.append('{}' if pred_input is None else pred_input)

    metric = {}
    rouge = evaluate_rougel(answer_pred, answer_ref)
    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    hallu_rate = hallu_pred  # a single sample, so this is 0 or 1
    metric['Act.EM'] = action_em
    metric['F1'] = f1
    metric['HalluRate'] = hallu_rate
    metric['plan_em'] = plan_em
    metric['Easy_F1'] = easy_f1
    metric['Hard_F1'] = hard_f1
    metric['Rouge-L'] = rouge
    return metric
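

# A minimal usage sketch (not part of the original module); the tool name and
# strings below are hypothetical and only illustrate the expected shape of `data`.
if __name__ == '__main__':
    sample = {
        'target': 'Action: spott\nAction Input: {"is_id": "city center"}',       # hypothetical
        'predictions': 'Action: spott\nAction Input: {"is_id": "city center"}',  # hypothetical
        'tools': [{'name': 'spott'}],
    }
    print(calculate_metrics(sample))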