# Metrics for tool-call evaluation: answer Rouge-L, plan/action exact
# match, action-input F1, and tool-name hallucination counting.
import json
from evalscope.metrics import compute_rouge_score_one_sample
def evaluate_rougel(cand_list: list, ref_list: list):
    """Return the Rouge-L F score of candidate answers vs. references.

    Falls back to 0 when there are no references to score against, or
    when the scorer does not report a 'rouge-l-f' value.
    """
    if not ref_list:
        return 0
    scores = compute_rouge_score_one_sample(cand_list, ref_list)
    return scores.get('rouge-l-f', 0)


def evaluate_action_em(cand_list: list, ref_list: list):
    """Return the exact-match rate between predictions and references.

    Items are compared positionally; extra items in the longer list are
    ignored (``zip`` semantics), matching the original behavior.

    Returns 0 when either list is empty — the original only guarded an
    empty ``ref_list`` and raised ZeroDivisionError on an empty
    ``cand_list``.
    """
    if not ref_list or not cand_list:
        return 0
    matches = sum(1 for cand, ref in zip(cand_list, ref_list) if cand == ref)
    return matches / len(cand_list)


def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
    """Score predicted action inputs (JSON strings) against references.

    A sample is scored only when the predicted action name equals the
    reference action name; otherwise every bucket gets 0. For matching
    actions, each reference key found in the prediction counts as a full
    match when the value also matches and a half match otherwise, and F1
    is computed from the resulting recall/precision.

    Args:
        action_pred: Predicted action names.
        action_ref: Reference action names.
        cand_list: Predicted action inputs, as JSON strings.
        ref_list: Reference action inputs, as JSON strings.

    Returns:
        Tuple ``(easy_f1, hard_f1, f1)`` of averages. ``easy_f1`` covers
        samples whose reference input is the empty object ``{}``,
        ``hard_f1`` the remaining samples, and ``f1`` both. Each average
        is 0 when its bucket is empty.
    """
    easy_f1 = []
    hard_f1 = []
    f1 = []
    # zip (instead of indexing by range(len(action_pred))) avoids an
    # IndexError when the lists have unequal lengths.
    for pred_action, ref_action, cand_input, ref_input in zip(
            action_pred, action_ref, cand_list, ref_list):
        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
            continue
        try:
            ref_input_json = json.loads(ref_input)
        except Exception:
            # Unparseable reference input: skip the sample entirely
            # (original outer `except: pass`).
            continue
        try:
            cand_input_json = json.loads(cand_input)
        except Exception:
            # Prediction is not valid JSON: score 0 in whichever bucket
            # the reference belongs to.
            if ref_input_json == {}:
                easy_f1.append(0)
            else:
                hard_f1.append(0)
            f1.append(0)
            continue
        if ref_input_json == {}:
            # "Easy" case: the reference expects no arguments at all.
            score = 1 if cand_input_json == {} else 0
            easy_f1.append(score)
            f1.append(score)
            continue
        if not isinstance(ref_input_json, dict) or not isinstance(cand_input_json, dict):
            # Non-dict JSON payloads (lists, strings, numbers): the
            # original hit an AttributeError on `.keys()`/`.items()` that
            # the broad except turned into a 0 score — do that explicitly.
            hard_f1.append(0)
            f1.append(0)
            continue
        half_match = 0
        full_match = 0
        for key, value in ref_input_json.items():
            if key in cand_input_json:
                if cand_input_json[key] == value:
                    full_match += 1
                else:
                    half_match += 1

        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
        if recall + precision > 0:
            sample_f1 = (2 * recall * precision) / (recall + precision)
        else:
            # Originally this case raised ZeroDivisionError and was
            # swallowed by the broad except, which appended the same 0s.
            sample_f1 = 0
        hard_f1.append(sample_f1)
        f1.append(sample_f1)

    # Empty buckets average to 0 rather than dividing by zero.
    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
    f1_avg = sum(f1) / len(f1) if f1 else 0

    return easy_f1_avg, hard_f1_avg, f1_avg


def parse_action(text):
    """Extract the last ``Action:`` / ``Action Input:`` fields from *text*.

    Returns an ``(action, action_input)`` pair. ``action`` falls back to
    'none' and ``action_input`` to '{}' when the markers are absent.
    """
    input_marker = 'Action Input:'
    action_marker = 'Action:'

    action = 'None'
    action_input = '{}'
    if input_marker in text:
        start = text.rindex(input_marker) + len(input_marker)
        action_input = text[start:].strip()

    if action_marker in text:
        start = text.rindex(action_marker) + len(action_marker)
        action = text[start:].strip()
        # The action text may still carry a trailing input section; trim it.
        if input_marker in action:
            action = action[:action.index(input_marker)].strip()
    else:
        action = 'none'
    return action, action_input


def parse_output(text):
    """Classify model output into ``(plan, action, action_input, answer)``.

    ``plan`` is 'call' for any non-Finish action, 'finish' for a valid
    ``Finish``/``give_answer`` payload (with ``action_input`` parsed into
    a dict and ``answer`` normalized to the string 'None' when blank or
    missing), and 'give up' when the Finish payload is malformed or does
    not request an answer.
    """
    action, action_input = parse_action(text)
    if action != 'Finish':
        return 'call', action, action_input, None

    try:
        payload = json.loads(action_input)
        if payload['return_type'] != 'give_answer':
            return 'give up', None, None, None
        answer = 'None'
        if 'final_answer' in payload:
            answer = str(payload['final_answer'])
            # Treat empty / punctuation-only answers as missing.
            if answer.strip() in ('', '.', ','):
                answer = 'None'
        return 'finish', action, payload, answer
    except Exception:
        # Unparseable payload or missing 'return_type'.
        return 'give up', None, None, None


def calculate_metrics(data):
    """
    Calculate the metrics for the given data.

    Parses the reference (``data['target']``) and the prediction
    (``data['predictions']``) with ``parse_output`` and scores them:
    plan/action exact match, action-input F1, answer Rouge-L, and a
    hallucination count for predicted actions absent from
    ``data['tools']``. Returns ``{}`` for samples that are filtered out.

    Note: this handles a single sample per call, so each accumulator
    list below holds at most one element.
    """
    plan_ref = []
    plan_pred = []
    hallu_cases = []  # samples whose prediction named an unknown tool
    answer_ref = []
    action_ref = []
    action_input_ref = []
    answer_pred = []
    action_pred = []
    action_input_pred = []
    hallu_pred = 0  # 1 when the predicted action is not in data['tools']

    reference = data['target']
    prediction = data['predictions']
    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
    # Example parse result for a tool-call reference:
    # ref_plan: call
    # ref_action: spott
    # ref_input: {"is_id": "city center" }
    # ref_ans: None

    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
    # Drop samples whose reference is the known bad-hallucination marker.
    if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
        return {}
    # NOTE(review): this guard tests `pred_action is not None` but then
    # validates `ref_action` against the tool list — possibly intended
    # to be `ref_action is not None`; confirm against callers before
    # changing.
    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
        return {}

    # Hallucination: the prediction makes a real call to a tool name
    # that does not exist in this sample's tool list.
    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
        hallu_pred += 1
        hallu_cases.append(data)

    plan_ref.append(ref_plan)
    plan_pred.append(pred_plan)
    if ref_plan == 'give up':
        # Nothing beyond the plan itself is scored for give-up samples.
        pass
    elif ref_plan == 'finish':
        # Final-answer sample: collect answers for Rouge-L below.
        answer_ref.append(ref_ans)
        if pred_ans is None:
            answer_pred.append('none')
        else:
            answer_pred.append(pred_ans)
    else:
        # Tool-call sample: collect action names and JSON inputs,
        # substituting 'none'/'{}' when the prediction lacks them.
        action_ref.append(ref_action)
        action_input_ref.append(ref_input)
        if pred_action is None:
            action_pred.append('none')
        else:
            action_pred.append(pred_action)

        if pred_input is None:
            action_input_pred.append('{}')
        else:
            action_input_pred.append(pred_input)

    metric = {}
    rouge = evaluate_rougel(answer_pred, answer_ref)
    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    hallu_rate = hallu_pred  # single sample, so the count doubles as a rate
    metric['Act.EM'] = action_em
    metric['F1'] = f1
    metric['HalluRate'] = hallu_rate
    metric['plan_em'] = plan_em
    metric['Easy_F1'] = easy_f1
    metric['Hard_F1'] = hard_f1
    metric['Rouge-L'] = rouge
    return metric