evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/tool_bench/utils.py

import json

from evalscope.metrics import compute_rouge_score_one_sample


def evaluate_rougel(cand_list: list, ref_list: list):
    """Return the Rouge-L F-measure of the candidate answers against the references."""
    if len(ref_list) == 0:
        return 0
    rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
    rougel = rouge_score.get('rouge-l-f', 0)
    return rougel


def evaluate_action_em(cand_list: list, ref_list: list):
    """Return the exact-match rate between predicted and reference actions.

    The two lists are expected to have the same length.
    """
    if len(ref_list) == 0:
        return 0
    em = 0
    for cand, ref in zip(cand_list, ref_list):
        em += (1 if cand == ref else 0)
    return em / len(cand_list)
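

# Worked example (illustrative, not part of the original module):
# evaluate_action_em(['a', 'b'], ['a', 'c']) returns 0.5, since one of the
# two positions matches exactly.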


def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
    """Score predicted action inputs (JSON strings) against references.

    A reference key counts as a full match when the predicted value equals the
    reference value, and as a half match (weight 0.5) when the key is present
    with a different value.
    """
    easy_f1 = []
    hard_f1 = []
    f1 = []
    for i in range(len(action_pred)):
        ref_action = action_ref[i]
        pred_action = action_pred[i]
        ref_input = ref_list[i]
        cand_input = cand_list[i]
        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
        else:
            try:
                ref_input_json = json.loads(ref_input)
                try:
                    cand_input_json = json.loads(cand_input)
                    half_match = 0
                    full_match = 0
                    if ref_input_json == {}:
                        # Empty reference input: the prediction must also be empty.
                        if cand_input_json == {}:
                            easy_f1.append(1)
                            f1.append(1)
                        else:
                            easy_f1.append(0)
                            f1.append(0)
                    else:
                        for k, v in ref_input_json.items():
                            if k in cand_input_json:
                                if cand_input_json[k] == v:
                                    full_match += 1
                                else:
                                    half_match += 1
                        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
                        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
                        if recall + precision == 0:
                            # No overlapping keys at all.
                            hard_f1.append(0)
                            f1.append(0)
                        else:
                            hard_f1.append((2 * recall * precision) / (recall + precision))
                            f1.append((2 * recall * precision) / (recall + precision))
                except Exception:
                    # The predicted input is not valid JSON.
                    if ref_input_json == {}:
                        easy_f1.append(0)
                    else:
                        hard_f1.append(0)
                    f1.append(0)
            except Exception:
                # The reference input is not valid JSON; skip the sample.
                pass
    # Return 0 for any average whose list is empty.
    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
    f1_avg = sum(f1) / len(f1) if f1 else 0
    return easy_f1_avg, hard_f1_avg, f1_avg
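

# Worked example (illustrative, not part of the original module): with matching
# actions, ref_input '{"a": 1, "b": 2}' and cand_input '{"a": 1, "b": 3}' give
# full_match = 1 ("a") and half_match = 1 ("b" is present with a wrong value),
# so recall = precision = (0.5 * 1 + 1) / 2 = 0.75 and the hard F1 is 0.75.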


def parse_action(text):
    """Extract the last 'Action:' and 'Action Input:' fields from a model response."""
    action_input = '{}'
    if 'Action Input:' in text:
        input_idx = text.rindex('Action Input:')
        action_input = text[input_idx + len('Action Input:'):].strip()
    if 'Action:' in text:
        action_idx = text.rindex('Action:')
        action = text[action_idx + len('Action:'):].strip()
        # Drop the 'Action Input:' section if it trails the action name.
        if 'Action Input:' in action:
            input_idx = action.index('Action Input:')
            action = action[:input_idx].strip()
    else:
        action = 'none'
    return action, action_input
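

# Example (illustrative): parse_action('Action: spott\nAction Input: {"is_id": "city center"}')
# returns ('spott', '{"is_id": "city center"}'); the values mirror the sample
# reference shown in calculate_metrics below.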


def parse_output(text):
    """Classify a response as a 'finish', 'give up', or 'call' plan and return its parts."""
    action, action_input = parse_action(text)
    if action == 'Finish':
        try:
            action_input = json.loads(action_input)
            return_type = action_input['return_type']
            if return_type == 'give_answer':
                if 'final_answer' in action_input:
                    answer = str(action_input['final_answer'])
                    # Treat empty or punctuation-only answers as missing.
                    if answer.strip() in ['', '.', ',']:
                        answer = 'None'
                else:
                    answer = 'None'
                return 'finish', action, action_input, answer
            else:
                return 'give up', None, None, None
        except Exception:
            return 'give up', None, None, None
    else:
        plan = 'call'
        answer = None
        return plan, action, action_input, answer
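

# Example (illustrative): a response ending in
#   Action: Finish
#   Action Input: {"return_type": "give_answer", "final_answer": "42"}
# yields ('finish', 'Finish', {...}, '42'); any other action yields a 'call'
# plan with the raw action-input string.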


def calculate_metrics(data):
    """Calculate the ToolBench metrics for a single sample.

    `data` must carry a 'target' reference string, a 'predictions' string and a
    'tools' list of dicts with a 'name' key.
    """
    plan_ref = []
    plan_pred = []
    hallu_cases = []
    answer_ref = []
    action_ref = []
    action_input_ref = []
    answer_pred = []
    action_pred = []
    action_input_pred = []
    hallu_pred = 0

    reference = data['target']
    prediction = data['predictions']
    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
    # Example reference parse:
    #   ref_plan: call
    #   ref_action: spott
    #   ref_input: {"is_id": "city center"}
    #   ref_ans: None
    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)

    # Skip samples whose reference action is itself a hallucination.
    if ref_action == 'invalid_hallucination_function_name':
        return {}
    # Skip samples whose reference action is not among the listed tools.
    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
        return {}
    # A predicted action outside the tool list counts as a hallucination.
    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
        hallu_pred += 1
        hallu_cases.append(data)

    plan_ref.append(ref_plan)
    plan_pred.append(pred_plan)
    if ref_plan == 'give up':
        pass
    elif ref_plan == 'finish':
        answer_ref.append(ref_ans)
        answer_pred.append('none' if pred_ans is None else pred_ans)
    else:
        action_ref.append(ref_action)
        action_input_ref.append(ref_input)
        action_pred.append('none' if pred_action is None else pred_action)
        action_input_pred.append('{}' if pred_input is None else pred_input)

    metric = {}
    rouge = evaluate_rougel(answer_pred, answer_ref)
    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    hallu_rate = hallu_pred  # a single sample, so this is 0 or 1
    metric['Act.EM'] = action_em
    metric['F1'] = f1
    metric['HalluRate'] = hallu_rate
    metric['plan_em'] = plan_em
    metric['Easy_F1'] = easy_f1
    metric['Hard_F1'] = hard_f1
    metric['Rouge-L'] = rouge
    return metric
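

# A minimal usage sketch (not part of the original module); the tool name and
# strings below are hypothetical and only illustrate the expected shape of `data`.
if __name__ == '__main__':
    sample = {
        'target': 'Action: spott\nAction Input: {"is_id": "city center"}',       # hypothetical
        'predictions': 'Action: spott\nAction Input: {"is_id": "city center"}',  # hypothetical
        'tools': [{'name': 'spott'}],
    }
    print(calculate_metrics(sample))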