# Copyright (c) Alibaba, Inc. and its affiliates.
"""Offline evaluation of agent tool-calling predictions.

Reads ``predictions.json`` from an input directory, compares each predicted
step with its reference step and writes ``metrics.json`` and
``hallu_cases.json`` to an output directory.
"""
import json
import os
from dataclasses import dataclass

try:
    from rouge import Rouge
except ImportError:
    # ``rouge`` is only needed to score final answers (ROUGE-L). Keep the
    # module importable without it; the failure is raised at the point of use.
    Rouge = None

_ACTION_TAG = 'Action:'
_ACTION_INPUT_TAG = 'Action Input:'

# Errors that mean "this text is not usable JSON" for ``json.loads``.
_JSON_ERRORS = (ValueError, TypeError, RecursionError)


@dataclass
class EvalArgs:
    # Directory containing ``predictions.json``. ``run_eval`` rewrites this
    # field in place to the full path of that file.
    input_path: str
    # Directory that receives ``metrics.json`` and ``hallu_cases.json``.
    # ``run_eval`` rewrites this field in place to the ``metrics.json`` path.
    output_path: str


def _mean(values: list) -> float:
    """Return the arithmetic mean of *values*, or 0.0 for an empty list."""
    return sum(values) / len(values) if values else 0.0


def _evaluate_rougel(cand_list: list, ref_list: list):
    """Return the averaged ROUGE-L F-score of candidates against references.

    Returns 0 when there are no references to score.

    Raises:
        ImportError: if answers must be scored but ``rouge`` is not installed.
    """
    if not ref_list:
        return 0
    if Rouge is None:
        raise ImportError("The 'rouge' package is required to score final answers.")
    scores = Rouge().get_scores(hyps=cand_list, refs=ref_list, avg=True)
    return scores['rouge-l']['f']


def _evaluate_action_em(cand_list: list, ref_list: list):
    """Return the fraction of positions where candidate equals reference.

    Returns 0 when *ref_list* is empty.
    """
    if not ref_list:
        return 0
    hits = sum(1 for cand, ref in zip(cand_list, ref_list) if cand == ref)
    return hits / len(cand_list)


def _argument_f1(ref_args, cand_input) -> float:
    """Score a candidate ``Action Input`` string against parsed reference args.

    * Candidate is not valid JSON: 0.
    * Reference is ``{}``: 1 if the candidate is also ``{}``, else 0.
    * Otherwise each reference key present in the candidate counts 1 when the
      values are equal and 0.5 when they differ; precision and recall are the
      matched weight over the candidate / reference key counts, and their
      harmonic mean is returned (0 when nothing matched).
    """
    try:
        cand_args = json.loads(cand_input)
    except _JSON_ERRORS:
        return 0
    if ref_args == {}:
        return 1 if cand_args == {} else 0
    if not isinstance(ref_args, dict) or not isinstance(cand_args, dict):
        # Arguments that are not JSON objects cannot be compared key by key.
        return 0

    matched = 0.0
    for key, value in ref_args.items():
        if key in cand_args:
            matched += 1.0 if cand_args[key] == value else 0.5
    if matched == 0:
        # Explicit zero instead of relying on a swallowed 0/0 division error.
        return 0
    recall = matched / len(ref_args)
    precision = matched / len(cand_args)
    return (2 * recall * precision) / (recall + precision)


def _evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list) -> tuple:
    """Return ``(easy_f1, hard_f1, f1)`` for predicted action inputs.

    "easy" samples are those whose reference arguments are ``{}``; "hard"
    samples are all others. A sample whose predicted action differs from the
    reference action scores 0 in all three buckets. A sample whose reference
    input is not valid JSON is left out of every bucket. Each returned value
    is the mean of its bucket, or 0.0 when the bucket is empty.
    """
    easy_f1 = []
    hard_f1 = []
    f1 = []
    for pred_action, ref_action, cand_input, ref_input in zip(action_pred, action_ref, cand_list, ref_list):
        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
            continue
        try:
            ref_args = json.loads(ref_input)
        except _JSON_ERRORS:
            # Unparseable reference: the sample cannot be scored, skip it.
            continue
        score = _argument_f1(ref_args, cand_input)
        bucket = easy_f1 if ref_args == {} else hard_f1
        bucket.append(score)
        f1.append(score)
    return _mean(easy_f1), _mean(hard_f1), _mean(f1)


def _parse_action(text: str) -> tuple:
    """Extract ``(action, action_input)`` from the last tagged step in *text*.

    The action is the text after the last ``Action:`` tag, cut at a following
    ``Action Input:`` tag; it defaults to ``'none'``. The action input is the
    text after the last ``Action Input:`` tag; it defaults to ``'{}'``.
    """
    action = 'none'
    action_input = '{}'
    if _ACTION_INPUT_TAG in text:
        start = text.rindex(_ACTION_INPUT_TAG) + len(_ACTION_INPUT_TAG)
        action_input = text[start:].strip()
    if _ACTION_TAG in text:
        start = text.rindex(_ACTION_TAG) + len(_ACTION_TAG)
        action = text[start:].strip()
        if _ACTION_INPUT_TAG in action:
            action = action[:action.index(_ACTION_INPUT_TAG)].strip()
    return action, action_input


def _parse_output(text: str) -> tuple:
    """Classify one step and return ``(plan, action, action_input, answer)``.

    * ``('call', action, raw_input_str, None)`` for any action but ``Finish``.
    * ``('finish', 'Finish', parsed_input_dict, answer)`` for ``Finish`` with
      ``return_type == 'give_answer'``; a missing or blank-ish answer becomes
      the string ``'None'``.
    * ``('give up', None, None, None)`` for any other ``Finish`` step,
      including one whose input cannot be parsed.
    """
    action, action_input = _parse_action(text)
    if action != 'Finish':
        return 'call', action, action_input, None

    give_up = ('give up', None, None, None)
    try:
        action_input = json.loads(action_input)
        return_type = action_input['return_type']
    except (KeyError, *_JSON_ERRORS):
        return give_up
    if return_type != 'give_answer':
        return give_up

    answer = 'None'
    if 'final_answer' in action_input:
        answer = str(action_input['final_answer'])
        if answer.strip() in ['', '.', ',']:
            answer = 'None'
    return 'finish', action, action_input, answer


def _tool_names(record: dict) -> list:
    """Return the names of the tools listed in *record*."""
    return [tool['name'] for tool in record['tools']]


def run_eval(args: EvalArgs):
    """Evaluate predictions and write the metric files.

    ``args.input_path`` and ``args.output_path`` are directories on entry and
    are rewritten in place to ``<input>/predictions.json`` and
    ``<output>/metrics.json``. The predictions file is a JSON list of records,
    each read through the keys ``'target'`` (reference step text),
    ``'predictions'`` (predicted step text) and ``'tools'`` (a list of objects
    with a ``'name'``).

    Metrics written to ``metrics.json``: ``rouge``, ``plan_em``,
    ``action_em``, ``easy_f1``, ``hard_f1``, ``f1`` and ``hallu_rate``.
    Records whose predicted action is not among the record's tools are written
    to ``hallu_cases.json`` next to it. Every metric is 0 when there is
    nothing to score.
    """
    print(f'*** Start evaluation with eval args: {args}\n')

    # NOTE(review): mutating ``args`` means a second call with the same object
    # compounds the paths; kept because callers may read the rewritten fields.
    args.input_path = os.path.join(args.input_path, 'predictions.json')
    args.output_path = os.path.join(args.output_path, 'metrics.json')

    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    plan_ref = []
    plan_pred = []
    hallu_cases = []
    answer_ref = []
    answer_pred = []
    action_ref = []
    action_pred = []
    action_input_ref = []
    action_input_pred = []
    hallu_pred = 0

    for d in data:
        ref_plan, ref_action, ref_input, ref_ans = _parse_output(d['target'])
        pred_plan, pred_action, pred_input, pred_ans = _parse_output(d['predictions'])

        if ref_action == 'invalid_hallucination_function_name':
            continue
        # NOTE(review): this tests ``pred_action`` while checking
        # ``ref_action``, which looks like a typo for ``ref_action is not
        # None``. Left as is because changing it alters which records are
        # scored and therefore previously reported numbers.
        if pred_action is not None and ref_action != 'none' and ref_action not in _tool_names(d):
            continue

        if pred_action is not None and pred_action != 'none' and pred_action not in _tool_names(d):
            hallu_pred += 1
            hallu_cases.append(d)

        plan_ref.append(ref_plan)
        plan_pred.append(pred_plan)
        if ref_plan == 'give up':
            continue
        if ref_plan == 'finish':
            answer_ref.append(ref_ans)
            answer_pred.append('none' if pred_ans is None else pred_ans)
        else:
            action_ref.append(ref_action)
            action_input_ref.append(ref_input)
            action_pred.append('none' if pred_action is None else pred_action)
            action_input_pred.append('{}' if pred_input is None else pred_input)

    rouge = _evaluate_rougel(answer_pred, answer_ref)
    plan_em = _evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = _evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = _evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    # The rate is over all input records, including skipped ones.
    hallu_rate = hallu_pred / len(data) if data else 0

    metric = {
        'rouge': rouge,
        'plan_em': plan_em,
        'action_em': action_em,
        'easy_f1': easy_f1,
        'hard_f1': hard_f1,
        'f1': f1,
        'hallu_rate': hallu_rate,
    }

    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        # An empty dirname means the current directory; makedirs('') fails.
        os.makedirs(output_dir, exist_ok=True)
    print(metric)
    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump(metric, f, indent=2)
    with open(os.path.join(output_dir, 'hallu_cases.json'), 'w', encoding='utf-8') as f:
        json.dump(hallu_cases, f, indent=2)