evalscope_v0.17.0/evalscope.0.17.0/evalscope/metrics/rouge_metric.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import jieba
from collections import defaultdict
from rouge_chinese import Rouge
from statistics import mean
from tqdm import tqdm

from evalscope.constants import MetricsConstant
from evalscope.metrics.bundled_rouge_score import rouge_scorer
from evalscope.utils.logger import get_logger

logger = get_logger()

class DummyTokenizer:
    """Whitespace tokenizer for text that is already pre-tokenized."""

    def tokenize(self, text: str):
        return text.split()

def is_contains_chinese(strs):
    """Return True if the string contains at least one Chinese (CJK) character."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False
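
# Illustrative checks (comments only, not executed):
#   is_contains_chinese('rouge 指标')  -> True
#   is_contains_chinese('rouge only')  -> False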

def compute_rouge_score(predict_l, reference_l):
    """Average per-sample ROUGE scores over the whole corpus (scaled to 0-100)."""
    assert len(predict_l) == len(reference_l)
    if len(predict_l) == 0:
        return {key: 0 for key in MetricsConstant.ROUGE_KEYS}

    result = defaultdict(list)
    for p, r in tqdm(zip(predict_l, reference_l)):
        one_sample = compute_rouge_score_one_sample(p, r)
        for rouge_key in MetricsConstant.ROUGE_KEYS:
            result[rouge_key].append(one_sample[rouge_key])

    rlt = {}
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
    return rlt
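
# Illustrative usage (comments only; assumes each element of predict_l / reference_l is itself
# an iterable of whitespace-tokenized strings, since compute_rouge_score_one_sample zips the
# pair element-wise, and that MetricsConstant.ROUGE_KEYS matches its lower-case result keys):
#   scores = compute_rouge_score([['the cat sat']], [['the cat sat on the mat']])
#   scores['rouge-1-f']  # mean over samples, scaled to 0-100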

def compute_rouge_score_one_sample_zh(predict, reference):
    """Compute ROUGE for one sample, segmenting Chinese text with jieba first."""
    result = dict()
    zh_scorer = Rouge()
    for p, r in zip(predict, reference):
        # rouge_chinese expects space-separated tokens, so segment Chinese text with jieba.
        p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
        r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
        try:
            score = zh_scorer.get_scores(p, r, ignore_empty=True)[0]
        except Exception as e:
            logger.warning(f'rouge score error: {p} {r} {e}')
            continue
        result['Rouge-1-R'] = score['rouge-1']['r']
        result['Rouge-1-P'] = score['rouge-1']['p']
        result['Rouge-1-F'] = score['rouge-1']['f']
        result['Rouge-2-R'] = score['rouge-2']['r']
        result['Rouge-2-P'] = score['rouge-2']['p']
        result['Rouge-2-F'] = score['rouge-2']['f']
        result['Rouge-L-R'] = score['rouge-l']['r']
        result['Rouge-L-P'] = score['rouge-l']['p']
        result['Rouge-L-F'] = score['rouge-l']['f']
    return result
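
# Illustrative usage (comments only; predict / reference are iterables of raw strings, and only
# the scores of the last successfully scored pair are kept in the returned dict):
#   compute_rouge_score_one_sample_zh(['今天天气很好'], ['今天天气不错'])
#   -> {'Rouge-1-R': ..., 'Rouge-1-P': ..., ..., 'Rouge-L-F': ...}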

def compute_rouge_score_one_sample(predict, reference):
    """Compute ROUGE-1/2/L for one sample with the bundled rouge_scorer."""
    result = dict()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
    for p, r in zip(predict, reference):
        try:
            score = scorer.score(p, r)
        except Exception as e:
            logger.warning(f'rouge score error: {p} {r} {e}')
            continue
        result['rouge-1-r'] = score['rouge1'].recall
        result['rouge-1-p'] = score['rouge1'].precision
        result['rouge-1-f'] = score['rouge1'].fmeasure
        result['rouge-2-r'] = score['rouge2'].recall
        result['rouge-2-p'] = score['rouge2'].precision
        result['rouge-2-f'] = score['rouge2'].fmeasure
        result['rouge-l-r'] = score['rougeL'].recall
        result['rouge-l-p'] = score['rougeL'].precision
        result['rouge-l-f'] = score['rougeL'].fmeasure
    return result
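
# Illustrative usage (comments only; inputs are iterables of pre-tokenized, space-separated
# strings because DummyTokenizer only splits on whitespace):
#   compute_rouge_score_one_sample(['the cat sat'], ['the cat sat on the mat'])
#   -> {'rouge-1-r': ..., 'rouge-1-p': ..., ..., 'rouge-l-f': ...}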

def _to_table(final_result) -> str:
    """Render total and per-task ROUGE results as a tab-separated table."""
    table = []

    # step 1. table header
    all_tasks = ['', 'total']
    all_tasks.extend(final_result['all_tasks'].split(','))
    table.append('\t'.join(all_tasks))

    # step 2. table rows
    for rouge_key in MetricsConstant.ROUGE_KEYS:
        row = [rouge_key]
        for task in all_tasks:
            if not task:
                continue
            elif task == 'total':
                row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
            else:
                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
        table.append('\t'.join(row))

    return '\n'.join(table)
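
# Illustrative input (comments only, sketching the assumed `final_result` layout):
#   final_result = {
#       'all_tasks': 'summarization,qa',
#       'total': {'rouge': {key: 12.34 for key in MetricsConstant.ROUGE_KEYS}},
#       'tasks': {'summarization': {'rouge': {...}}, 'qa': {'rouge': {...}}},
#   }
#   print(_to_table(final_result))  # one tab-separated row per ROUGE key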

def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
    """Score every sample, then print the metric for the whole set and per task tag."""
    print(f"{'#' * md_level} Rouge Eval")
    for data in tqdm(data_l):
        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])

    # Group samples by task tag so the metric can also be reported per task.
    task_data_d = defaultdict(list)
    for data in data_l:
        for task in data['task_tags']:
            task_data_d[task].append(data)

    total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
          f'{total_rouge * 100:0.2f}%')

    for task, task_data in task_data_d.items():
        task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
        print(f'[{task}], count: {len(task_data)}, {report_metric_key}: '
              f'{task_rouge * 100:0.2f}%')
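
# Illustrative input (comments only, sketching the assumed per-sample schema):
#   data_l = [{
#       'gen_tok_str': [...],        # model output(s), whitespace-tokenized
#       'reference_tok_str': [...],  # reference(s), whitespace-tokenized
#       'task_tags': ['summarization'],
#   }]
#   run_rouge_eval(data_l)  # prints a markdown heading plus total and per-task lines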