evalscope_v0.17.0/evalscope.0.17.0/evalscope/third_party/toolbench_static/eval.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os
from dataclasses import dataclass
from rouge import Rouge


@dataclass
class EvalArgs:
    input_path: str
    output_path: str


def run_eval(args: EvalArgs):
    print(f'*** Start evaluation with eval args: {args}\n')
    args.input_path = os.path.join(args.input_path, 'predictions.json')
    args.output_path = os.path.join(args.output_path, 'metrics.json')
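    # Expected shape of each predictions.json record, inferred from the loop
    # below: {"target": <reference text>, "predictions": <model text>,
    # "tools": [{"name": ...}, ...]}.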
    def evaluate_rougel(cand_list: list, ref_list: list):
        # ROUGE-L F1 between predicted and reference final answers.
        if len(ref_list) == 0:
            return 0
        rouge = Rouge()
        rouge_score = rouge.get_scores(hyps=cand_list, refs=ref_list, avg=True)
        rougel = rouge_score['rouge-l']['f']
        return rougel
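    # Illustrative behavior (hypothetical inputs): identical candidate and
    # reference strings score rouge-l f == 1.0; token-disjoint strings score 0.0.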
    def evaluate_action_em(cand_list: list, ref_list: list):
        # Exact-match rate between candidate and reference sequences.
        if len(ref_list) == 0:
            return 0
        em = 0
        for cand, ref in zip(cand_list, ref_list):
            em += (1 if cand == ref else 0)
        return em / len(cand_list)
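    # Example (hypothetical): cand_list ['spott', 'none'] vs. ref_list
    # ['spott', 'search'] -> one exact match over two pairs, EM = 0.5.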
    def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
        # Compare predicted vs. reference action inputs (JSON argument strings).
        # easy_f1 scores references with empty inputs, hard_f1 scores non-empty
        # ones, and f1 aggregates both; a key present with the wrong value
        # counts as half a match.
        easy_f1 = []
        hard_f1 = []
        f1 = []
        for i in range(len(action_pred)):
            ref_action = action_ref[i]
            pred_action = action_pred[i]
            ref_input = ref_list[i]
            cand_input = cand_list[i]
            if ref_action != pred_action:
                easy_f1.append(0)
                hard_f1.append(0)
                f1.append(0)
            else:
                try:
                    ref_input_json = json.loads(ref_input)
                    try:
                        cand_input_json = json.loads(cand_input)
                        half_match = 0
                        full_match = 0
                        if ref_input_json == {}:
                            # Reference expects no arguments.
                            match = 1 if cand_input_json == {} else 0
                            easy_f1.append(match)
                            f1.append(match)
                        else:
                            for k, v in ref_input_json.items():
                                if k in cand_input_json:
                                    if cand_input_json[k] == v:
                                        full_match += 1
                                    else:
                                        half_match += 1
                            recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
                            precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
                            score = (2 * recall * precision) / (recall + precision + 1e-30)
                            hard_f1.append(score)
                            f1.append(score)
                    except Exception:
                        # Unparsable (or non-dict) candidate input scores zero.
                        if ref_input_json == {}:
                            easy_f1.append(0)
                        else:
                            hard_f1.append(0)
                        f1.append(0)
                except Exception:
                    # Unparsable reference input: skip the sample.
                    pass
        return (sum(easy_f1) / (len(easy_f1) + 1e-30), sum(hard_f1) / (len(hard_f1) + 1e-30),
                sum(f1) / (len(f1) + 1e-30))
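    # Worked example (hypothetical values): reference input {"a": 1, "b": 2}
    # vs. candidate {"a": 1, "b": 3, "c": 0} gives full_match=1, half_match=1,
    # so recall = 1.5/2 = 0.75, precision = 1.5/3 = 0.5, and F1 = 0.6.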
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)
    def parse_action(text):
        # Pull the last 'Action:' and 'Action Input:' spans out of a
        # ReAct-style generation.
        action = 'none'
        action_input = '{}'
        if 'Action Input:' in text:
            input_idx = text.rindex('Action Input:')
            action_input = text[input_idx + len('Action Input:'):].strip()
        if 'Action:' in text:
            action_idx = text.rindex('Action:')
            action = text[action_idx + len('Action:'):].strip()
            if 'Action Input:' in action:
                input_idx = action.index('Action Input:')
                action = action[:input_idx].strip()
        return action, action_input
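    # Example (hypothetical text): parse_action('Action: spott\nAction Input:
    # {"is_id": "city center"}') returns ('spott', '{"is_id": "city center"}').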
    def parse_output(text):
        # Classify an output as 'finish' (final answer given), 'give up'
        # (malformed Finish call), or 'call' (intermediate tool call).
        action, action_input = parse_action(text)
        if action == 'Finish':
            try:
                action_input = json.loads(action_input)
                return_type = action_input['return_type']
                if return_type == 'give_answer':
                    if 'final_answer' in action_input:
                        answer = str(action_input['final_answer'])
                        if answer.strip() in ['', '.', ',']:
                            answer = 'None'
                    else:
                        answer = 'None'
                    return 'finish', action, action_input, answer
                else:
                    return 'give up', None, None, None
            except Exception:
                return 'give up', None, None, None
        else:
            return 'call', action, action_input, None
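    # Example (hypothetical text): 'Action: Finish\nAction Input:
    # {"return_type": "give_answer", "final_answer": "42"}' parses to
    # ('finish', 'Finish', {...}, '42'); any non-Finish action yields
    # ('call', <action>, <action_input>, None).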
    # Accumulators for per-sample parses; hallu_pred counts hallucinated tools.
    plan_ref, plan_pred = [], []
    answer_ref, answer_pred = [], []
    action_ref, action_pred = [], []
    action_input_ref, action_input_pred = [], []
    hallu_cases = []
    hallu_pred = 0
    for d in data:
        reference = d['target']
        prediction = d['predictions']
        tool_names = [t['name'] for t in d['tools']]
        # e.g. ref_plan='call', ref_action='spott',
        # ref_input='{"is_id": "city center"}', ref_ans=None
        ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
        pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
        # Skip samples whose reference action is itself hallucinated.
        if ref_action == 'invalid_hallucination_function_name':
            continue
        if ref_action is not None and ref_action != 'none' and ref_action not in tool_names:
            continue
        # Count predictions that call a tool the sample does not offer.
        if pred_action is not None and pred_action != 'none' and pred_action not in tool_names:
            hallu_pred += 1
            hallu_cases.append(d)
        plan_ref.append(ref_plan)
        plan_pred.append(pred_plan)
        if ref_plan == 'give up':
            # Give-up references contribute only to plan EM.
            pass
        elif ref_plan == 'finish':
            answer_ref.append(ref_ans)
            answer_pred.append('none' if pred_ans is None else pred_ans)
        else:
            action_ref.append(ref_action)
            action_input_ref.append(ref_input)
            action_pred.append('none' if pred_action is None else pred_action)
            action_input_pred.append('{}' if pred_input is None else pred_input)
    metric = {}
    metric['rouge'] = evaluate_rougel(answer_pred, answer_ref)
    metric['plan_em'] = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    metric['action_em'] = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    metric['easy_f1'] = easy_f1
    metric['hard_f1'] = hard_f1
    metric['f1'] = f1
    metric['hallu_rate'] = hallu_pred / len(data)

    os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
    print(metric)
    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump(metric, f, indent=2)
    with open(args.output_path.replace('metrics.json', 'hallu_cases.json'), 'w', encoding='utf-8') as f:
        json.dump(hallu_cases, f, indent=2)
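

# Usage sketch (an assumed invocation, not part of the original module):
# run_eval reads <input_path>/predictions.json and writes metrics.json plus
# hallu_cases.json under <output_path>. The directory below is a placeholder.
if __name__ == '__main__':
    run_eval(EvalArgs(input_path='outputs/toolbench', output_path='outputs/toolbench'))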