evalscope_v0.17.0/evalscope.0.17.0/evalscope/metrics/metrics.py

# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) EleutherAI. and its affiliates.
# Copyright (c) OpenAI. and its affiliates.
import itertools
import math
import numpy as np
import random
import sacrebleu
from collections import defaultdict
from collections.abc import Iterable
from typing import TYPE_CHECKING, Dict, List, Union


def mean(arr: list):
    if not arr:
        return 0.0
    if isinstance(arr[0], list):
        arr = [item for sublist in arr for item in sublist]
    return sum(arr) / len(arr)
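# Usage sketch (illustrative, not part of the original module): mean() flattens one
# level of nesting before averaging, so per-sample score lists can be passed directly.
# >>> mean([1, 0, 1])
# 0.6666666666666666
# >>> mean([[1, 0], [1]])
# 0.6666666666666666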


def pass_at_k(arr: Union[List[int], List[List[int]]], k: int = 1) -> float:
    if not arr:
        return 0.0

    def sub_pass_at_k(sub_arr: List[int]) -> float:
        return 1.0 if any(sub_arr[:k]) else 0.0

    if isinstance(arr[0], list):
        return sum(sub_pass_at_k(sub_arr) for sub_arr in arr) / len(arr)
    else:
        return sum(arr) / len(arr)
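# Usage sketch (illustrative): with nested lists, each inner list holds per-sample
# pass/fail flags for one problem and only the first k samples are considered.
# >>> pass_at_k([[0, 1, 0], [0, 0, 1]], k=2)  # problem 1 passes within 2 tries, problem 2 does not
# 0.5
# >>> pass_at_k([1, 0, 1, 0])                 # a flat list falls back to a plain mean
# 0.5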


def pop_stddev(arr):
    mu = mean(arr)
    return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


def sample_stddev(arr):
    mu = mean(arr)
    return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


def mean_stderr(arr):
    return sample_stddev(arr) / math.sqrt(len(arr))


def median(arr):
    # Note: returns the middle element as-is; callers are expected to pass a sorted list.
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    import sklearn.metrics

    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def simple_f1_score(scores: tuple) -> float:
    score1 = scores[0]
    score2 = scores[1]
    score1 = np.mean(score1) if len(score1) > 0 else 0.0
    score2 = np.mean(score2) if len(score2) > 0 else 0.0
    if score1 == 0 and score2 == 0:
        return 0.0
    else:
        return 2 * score1 * score2 / (score1 + score2)
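# Usage sketch (illustrative): the two tuple entries are averaged independently and then
# combined as a harmonic mean, e.g. precision-like and recall-like score lists.
# >>> simple_f1_score(([1.0, 0.5], [0.5, 0.5]))  # means are 0.75 and 0.5 -> harmonic mean 0.6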


def f1_score(items):
    import sklearn.metrics

    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    return np.max(fscore)


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    for doc, pred in zip(docs, preds):
        paragraph_id = doc['idx']['paragraph']
        question_id = doc['idx']['question']
        if (paragraph_id, question_id) not in question_scoring_dict:
            question_scoring_dict[(paragraph_id, question_id)] = []
        gold_label = doc['label'] == 1
        question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def acc_all_stderr(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    for doc, pred in zip(docs, preds):
        question_id = doc['idx']['question']
        if question_id not in question_scoring_dict:
            question_scoring_dict[question_id] = []
        gold_label = doc['label'] == 1
        question_scoring_dict[question_id].append(gold_label == pred)
    acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute the max metric between the prediction and each ground truth."""
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)
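# Usage sketch (illustrative): any binary or scalar metric can be plugged in; the best
# score over the reference set is kept.
# >>> metric_max_over_ground_truths(lambda pred, gold: float(pred == gold), 'cat', ['dog', 'cat'])
# 1.0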


def perplexity(items):
    return math.exp(-mean(items))


def weighted_mean(items: List) -> float:
    # e.g. [(0, 1), (0.5, 1), (1, 1)]
    a, b = zip(*items)
    return sum(a) / sum(b)


def micro_mean(items):
    try:
        return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
    except ZeroDivisionError:
        return 0.0


def macro_mean(items):
    try:
        return sum([item.score for item in items]) / len(items)
    except ZeroDivisionError:
        return 0.0


def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))


def bits_per_byte(items):
    return -weighted_mean(items) / math.log(2)
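# Usage sketch (illustrative): the perplexity helpers expect log-likelihood items, so
# passing (sum_logprob, num_tokens) pairs gives a length-weighted perplexity.
# >>> weighted_perplexity([(-2.0, 2), (-1.0, 2)])  # exp(3.0 / 4) ~= 2.117
# >>> perplexity([-2.0, -1.0])                     # exp(1.5) ~= 4.482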


def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence against a reference sentence. It counts matching
    n-grams in the candidate translation against n-grams in the reference text, where
    a 1-gram or unigram is each token and a bigram comparison is each word pair.
    The comparison is made regardless of word order.
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_bleu(preds, refs).score


def bleu_ngram_one_sample(predict, reference):
    """
    Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores for a single prediction/reference pair.

    Args:
        predict: the predicted (generated) sentence.
        reference: the reference sentence.

    Returns:
        {
            'bleu-1': 0.8,
            'bleu-2': 0.45,
            'bleu-3': 0.0,
            'bleu-4': 0.0
        }
    """
    import jieba
    from nltk import word_tokenize
    from nltk.translate.bleu_score import sentence_bleu

    def is_contains_chinese(strs):
        for _char in strs:
            if '\u4e00' <= _char <= '\u9fa5':
                return True
        return False

    predict = list(jieba.cut(predict)) if is_contains_chinese(predict) else word_tokenize(predict)
    reference = [list(jieba.cut(reference))] if is_contains_chinese(reference) else [word_tokenize(reference)]

    result = dict()
    result['bleu-1'] = sentence_bleu(reference, predict, weights=(1, 0, 0, 0))
    result['bleu-2'] = sentence_bleu(reference, predict, weights=(0, 1, 0, 0))
    result['bleu-3'] = sentence_bleu(reference, predict, weights=(0, 0, 1, 0))
    result['bleu-4'] = sentence_bleu(reference, predict, weights=(0, 0, 0, 1))
    return result
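# Usage sketch (illustrative): Chinese text is segmented with jieba, everything else with
# nltk's word_tokenize (which requires the 'punkt' tokenizer data to be downloaded).
# >>> bleu_ngram_one_sample('the cat sat on the mat', 'the cat is on the mat')
# Returns a dict with keys 'bleu-1' .. 'bleu-4'; exact values depend on NLTK's default
# (unsmoothed) sentence_bleu behaviour.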


def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score


def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references.
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score


def is_non_str_iterable(obj):
    return isinstance(obj, Iterable) and not isinstance(obj, str)


def _sacreformat(refs, preds):
    """Format refs and preds for sacrebleu corpus calculation. It is very particular."""
    # Sacrebleu expects (List[str], List[List[str]]),
    # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
    # Note [ref1_stream] is the first reference for each pred.
    # So lists are size N and (M, N) for N preds and M possible refs for each pred.
    # This is a different order of dimensions than I would expect.

    # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds.
    # Must become List[List[str]] with the inner list corresponding to preds.
    if not is_non_str_iterable(refs):
        refs = list(refs)
    if not is_non_str_iterable(refs[0]):
        refs = [[ref] for ref in refs]
    refs = list(zip(*refs))
    # Note: the number of refs in each ref list must match the number of preds.

    # We expect preds to be List[str] or List[List[str]]. Must become List[str].
    if not is_non_str_iterable(preds):
        preds = list(preds)
    if is_non_str_iterable(preds[0]):
        assert len(preds[0]) == 1, f'Pred must be a str, was {preds[0]}'
        preds = [pred[0] for pred in preds]

    return refs, preds
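# Usage sketch (illustrative): per-prediction reference lists are transposed into
# sacrebleu's "one stream per reference position" layout.
# >>> _sacreformat([['r1a', 'r1b'], ['r2a', 'r2b']], ['p1', 'p2'])
# ([('r1a', 'r2a'), ('r1b', 'r2b')], ['p1', 'p2'])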


class _bootstrap_internal:

    def __init__(self, f, n):
        self.f = f
        self.n = n

    def __call__(self, v):
        i, xs = v
        rnd = random.Random()
        rnd.seed(i)
        res = []
        for _ in range(self.n):
            res.append(self.f(rnd.choices(xs, k=len(xs))))
        return res


def bootstrap_stderr(f, xs, iters):
    import multiprocessing as mp

    pool = mp.Pool(mp.cpu_count())
    # This gives a biased estimate of the stderr (i.e. with the mean, it gives something
    # equivalent to the stderr calculated without Bessel's correction in the stddev).
    # Unfortunately, I haven't been able to figure out what the right correction is
    # to make the bootstrap unbiased - I considered multiplying by sqrt(n/(n-1)) but
    # that would be ad-hoc and I can't prove that it would actually be an unbiased estimator.
    # Thankfully, this shouldn't matter because our samples are usually pretty big anyways.
    res = []
    chunk_size = min(1000, iters)

    from tqdm import tqdm
    print('bootstrapping for stddev:', f.__name__)
    for bootstrap in tqdm(
            pool.imap(
                _bootstrap_internal(f, chunk_size),
                [(i, xs) for i in range(iters // chunk_size)],
            ),
            total=iters // chunk_size,
    ):
        # sample w/ replacement
        res.extend(bootstrap)
    pool.close()
    return sample_stddev(res)


def stderr_for_metric(metric, bootstrap_iters):
    bootstrappable = [
        median,
        matthews_corrcoef,
        f1_score,
        perplexity,
        bleu,
        chrf,
        ter,
    ]

    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
    return stderr.get(metric, None)
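# Usage sketch (illustrative): bootstrappable metrics get a resampling-based stderr,
# while mean and acc_all fall back to closed-form standard errors.
# >>> stderr_fn = stderr_for_metric(median, bootstrap_iters=1000)
# >>> stderr_fn([1, 2, 3, 4, 5])  # bootstraps the median over resamples of the data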


def yesno(x):
    if x:
        return 'yes'
    else:
        return 'no'


def compute_elo(battles,
                col_model_a='model_a',
                col_model_b='model_b',
                col_win='win',
                tie_values=['tie', 'tie (bothbad)'],
                k=32,
                scale=400,
                base=10,
                init_rating=1000):
    # `battles` is expected to be a pandas DataFrame with one row per pairwise battle.
    rating = defaultdict(lambda: init_rating)

    for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + base**((rb - ra) / scale))
        eb = 1 / (1 + base**((ra - rb) / scale))
        if win == col_model_a:
            sa = 1
        elif win == col_model_b:
            sa = 0
        elif win in tie_values:
            sa = 0.5
        else:
            raise Exception(f'unexpected vote {win}')
        rating[model_a] += k * (sa - ea)
        rating[model_b] += k * (1 - sa - eb)

    return rating
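# Usage sketch (illustrative): this is the standard Elo update with expected score
# E_a = 1 / (1 + base**((R_b - R_a) / scale)) and R_a += k * (S_a - E_a).
# With default settings, a single 'model_a' win over two fresh models gives 1016.0 and 984.0:
# >>> import pandas as pd
# >>> battles = pd.DataFrame([{'model_a': 'm1', 'model_b': 'm2', 'win': 'model_a'}])
# >>> dict(compute_elo(battles))
# {'m1': 1016.0, 'm2': 984.0}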


def exact_match(gold: str, pred: str) -> float:
    if not pred:
        return 0
    return 1 if gold.strip() == pred.strip() else 0


def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[str, List[str]]) -> float:
    """
    Calculate accuracy for the ARC benchmark.

    Args:
        question_answers: question_id -> answer mapping, e.g. {'abc_123': 'A'}
        predictions: question_id -> prediction mapping, e.g. {'abc_123': ['D'], 'xyz_456': ['A', 'C']}

    Returns:
        accuracy score (float)

    Notes:
        Each question is worth one point. Models are allowed to give multiple answers (e.g., "A;C"),
        in which case the model receives 1/N points of credit if one of its N answers is correct.
        Refer to: https://leaderboard.allenai.org/arc/submissions/get-started
    """
    score = 0.0
    for question_id, answer in question_answers.items():
        try:
            predictions_for_q = predictions[question_id]
        except Exception as e:
            raise KeyError(f'Missing arc prediction: {e}')

        if answer in predictions_for_q:
            score += 1.0 / len(predictions_for_q)

        # Note: consumes entries from the caller's `predictions` dict so leftovers can be detected below.
        del predictions[question_id]

    if len(predictions) > 0:
        log_ex: str = ', '.join(list(predictions.keys())[:3])
        raise ValueError(f'Found {len(predictions)} extra predictions, for example: {log_ex}')

    return score / len(question_answers)
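# Usage sketch (illustrative): multiple answers for a question split its credit.
# >>> calculate_arc_accuracy({'q1': 'A', 'q2': 'B'}, {'q1': ['A', 'C'], 'q2': ['D']})
# 0.25
# q1 earns 1/2 point (correct answer among two guesses), q2 earns 0, so 0.5 / 2 questions = 0.25.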


def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
                        num_correct: Union[List[int], np.ndarray],
                        k: int = 1) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    Examples:
        >>> import numpy as np
        >>> total = np.array([5, 5, 5])
        >>> correct = np.array([2, 4, 2])
        >>> calculate_pass_at_k(total, correct, 1)
        array([0.4, 0.8, 0.4])
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
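# Usage sketch (illustrative): this is the unbiased HumanEval-style estimator
# pass@k = 1 - C(n - c, k) / C(n, k), computed as a running product for numerical stability.
# >>> calculate_pass_at_k(5, [2, 4], k=2)  # n=5 samples per problem -> approximately [0.7, 1.0]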