# Copyright (c) Alibaba, Inc. and its affiliates.

import jieba
from collections import defaultdict
from rouge_chinese import Rouge
from statistics import mean
from tqdm import tqdm

from evalscope.constants import MetricsConstant
from evalscope.metrics.bundled_rouge_score import rouge_scorer
from evalscope.utils.logger import get_logger

logger = get_logger()


class DummyTokenizer:
    """Whitespace tokenizer so RougeScorer can score pre-tokenized (space-separated) text."""

    def tokenize(self, text: str):
        return text.split()


def is_contains_chinese(strs):
    """Return True if the string contains at least one CJK unified ideograph."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False
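
# A minimal sketch of the helper above (hypothetical strings, not from the module):
#
#   is_contains_chinese('机器翻译 evaluation')  # True: contains CJK ideographs
#   is_contains_chinese('pure ASCII text')       # False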


def compute_rouge_score(predict_l, reference_l):
    """Average per-sample ROUGE scores over a corpus and scale them to 0-100."""
    assert len(predict_l) == len(reference_l)
    if len(predict_l) == 0:
        # No samples: return zeros for every configured ROUGE key.
        tmp_d = dict()
        for key in MetricsConstant.ROUGE_KEYS:
            tmp_d[key] = 0
        return tmp_d

    result = defaultdict(list)
    for p, r in tqdm(zip(predict_l, reference_l)):
        one_sample = compute_rouge_score_one_sample(p, r)
        for rouge_key in MetricsConstant.ROUGE_KEYS:
            result[rouge_key].append(one_sample[rouge_key])
    rlt = {}
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
    return rlt
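
# Illustrative call, assuming each element of predict_l / reference_l is the
# list-of-strings form that compute_rouge_score_one_sample expects and that
# MetricsConstant.ROUGE_KEYS matches its lowercase result keys:
#
#   scores = compute_rouge_score(
#       [['the cat sat on the mat'], ['a quick brown fox']],           # predictions
#       [['the cat is sitting on the mat'], ['the quick brown fox']],  # references
#   )
#   # scores maps each ROUGE key to a corpus mean scaled to 0-100,
#   # e.g. scores['rouge-l-f'] is a float in [0, 100].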


def compute_rouge_score_one_sample_zh(predict, reference):
    """Chinese ROUGE (rouge_chinese) for one sample; predict/reference are parallel lists of strings."""
    result = dict()
    zh_scorer = Rouge()
    for p, r in zip(predict, reference):
        # Segment Chinese text with jieba so the scorer can split on spaces.
        p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
        r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r

        try:
            score = zh_scorer.get_scores(p, r, ignore_empty=True)[0]
        except Exception as e:
            logger.warning(f'rouge score error: {p} {r} {e}')
            continue
        result['Rouge-1-R'] = score['rouge-1']['r']
        result['Rouge-1-P'] = score['rouge-1']['p']
        result['Rouge-1-F'] = score['rouge-1']['f']
        result['Rouge-2-R'] = score['rouge-2']['r']
        result['Rouge-2-P'] = score['rouge-2']['p']
        result['Rouge-2-F'] = score['rouge-2']['f']
        result['Rouge-L-R'] = score['rouge-l']['r']
        result['Rouge-L-P'] = score['rouge-l']['p']
        result['Rouge-L-F'] = score['rouge-l']['f']

    return result
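
# Illustrative call (hypothetical strings; rouge_chinese returns per-key dicts
# with 'r' / 'p' / 'f' fields):
#
#   zh_scores = compute_rouge_score_one_sample_zh(
#       ['今天天气很好'],        # predictions
#       ['今天的天气非常好'],    # references
#   )
#   # zh_scores -> {'Rouge-1-R': ..., 'Rouge-1-P': ..., ..., 'Rouge-L-F': ...}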


def compute_rouge_score_one_sample(predict, reference):
    """Whitespace-tokenized ROUGE for one sample; predict/reference are parallel lists of strings."""
    result = dict()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
    for p, r in zip(predict, reference):
        try:
            score = scorer.score(p, r)
        except Exception as e:
            logger.warning(f'rouge score error: {p} {r} {e}')
            continue
        result['rouge-1-r'] = score['rouge1'].recall
        result['rouge-1-p'] = score['rouge1'].precision
        result['rouge-1-f'] = score['rouge1'].fmeasure
        result['rouge-2-r'] = score['rouge2'].recall
        result['rouge-2-p'] = score['rouge2'].precision
        result['rouge-2-f'] = score['rouge2'].fmeasure
        result['rouge-l-r'] = score['rougeL'].recall
        result['rouge-l-p'] = score['rougeL'].precision
        result['rouge-l-f'] = score['rougeL'].fmeasure

    return result
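
# Illustrative call; the scorer splits on whitespace (DummyTokenizer), so the
# inputs should already be space-separated token strings:
#
#   en_scores = compute_rouge_score_one_sample(
#       ['the cat sat on the mat'],
#       ['the cat is sitting on the mat'],
#   )
#   # en_scores -> {'rouge-1-r': ..., 'rouge-1-p': ..., ..., 'rouge-l-f': ...}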


def _to_table(final_result) -> str:
    """Render the result dict as a tab-separated table: tasks as columns, ROUGE keys as rows."""
    table = []
    # step 1. table header
    all_tasks = ['', 'total']
    all_tasks.extend(final_result['all_tasks'].split(','))
    table.append('\t'.join(all_tasks))

    # step 2. table rows
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        row = [rouge_key]
        for task in all_tasks:
            if not task:
                continue
            elif task == 'total':
                row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
            else:
                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
        table.append('\t'.join(row))

    return '\n'.join(table)
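
# A sketch of the final_result layout _to_table expects (field names inferred
# from the lookups above, task names and values hypothetical):
#
#   final_result = {
#       'all_tasks': 'summarization,translation',
#       'total': {'rouge': {'rouge-l-f': 35.21, ...}},
#       'tasks': {
#           'summarization': {'rouge': {'rouge-l-f': 40.10, ...}},
#           'translation': {'rouge': {'rouge-l-f': 30.32, ...}},
#       },
#   }
#   print(_to_table(final_result))  # tab-separated table, one row per ROUGE key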


def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
    """Score every sample, then print the chosen ROUGE metric overall and per task tag."""
    print(f"{'#' * md_level} Rouge Eval")
    for data in tqdm(data_l):
        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
    # Group samples by task tag so per-task averages can be reported.
    task_data_d = defaultdict(list)
    for data in data_l:
        for task in data['task_tags']:
            task_data_d[task].append(data)

    total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
          f'{total_rouge * 100:0.2f}%')

    for task, task_data in task_data_d.items():
        task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
        print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
              f'{task_rouge * 100:0.2f}%')
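
# Example driver (hypothetical field values; each record needs the
# 'gen_tok_str', 'reference_tok_str' and 'task_tags' fields used above):
#
#   data_l = [
#       {
#           'gen_tok_str': ['the cat sat on the mat'],
#           'reference_tok_str': ['the cat is sitting on the mat'],
#           'task_tags': ['summarization'],
#       },
#   ]
#   run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f')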