import inspect
import math
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from evalscope.utils.logger import get_logger

logger = get_logger()


def process_review_item(review_item: dict) -> list:
    """
    Process a single review item to extract relevant information.

    Args:
        review_item (dict): The review item to process.

    Returns:
        list: One flattened record (dict) per choice in the review item.
    """
    res = []
    raw_input = review_item['raw_input']
    sample_index = review_item['index']
    question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
    # Find the first non-empty question key in raw_input
    question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
    for choice_index, choice in enumerate(review_item['choices']):
        raw_pred_answer = choice['message']['content']
        parsed_gold_answer = choice['review']['gold']
        parsed_pred_answer = choice['review']['pred']
        score = choice['review']['result']
        raw_d = {
            'Index': f'{sample_index}_{choice_index}',
            'Input': raw_input,
            'Question': question if question else '*No Question*',
            'Generated': raw_pred_answer,
            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
            'Pred': parsed_pred_answer,
            'Score': score,
        }
        res.append(raw_d)

    return res


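# Illustrative sketch (not part of the library's API): a review item is assumed to
# look roughly like the dict below, inferred from the keys accessed above.
#
#   review_item = {
#       'index': 0,
#       'raw_input': {'question': 'What is 2 + 2?'},
#       'choices': [{
#           'message': {'content': 'The answer is 4.'},
#           'review': {'gold': '4', 'pred': '4', 'result': 1.0},
#       }],
#   }
#   process_review_item(review_item)
#   # -> [{'Index': '0_0', 'Question': 'What is 2 + 2?', 'Generated': 'The answer is 4.',
#   #      'Gold': '4', 'Pred': '4', 'Score': 1.0, 'Input': {...}}]

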
def post_process_result(completion):
    """Extract the judge verdict (e.g. 'A>B', 'B>>A') from the '[[...]]' tag in the completion, or None if absent."""
    result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
    if result:
        return result[0]
    else:
        return None


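# Illustrative example (doctest-style, assuming the judge ends its answer with a
# bracketed verdict as prompted):
#
#   >>> post_process_result('Assistant A is clearly better. Verdict: [[A>>B]]')
#   'A>>B'
#   >>> post_process_result('No verdict was given.') is None
#   True

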
def get_judge_score(result, reverse=False):
    """
    Map a pairwise judgment to a numeric score in [0, 1].

    Args:
        result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
        reverse: Whether to reverse the score (the A and B scores are swapped)

    Returns:
        float: Score in [0, 1]; 0.5 for ties or unrecognized judgments.
    """
    # Base score mapping - using finer-grained scores
    if not reverse:
        score_mapping = {
            'A=B': 0.5,  # Tie
            'A>B': 0.75,  # A slightly wins
            'A>>B': 1.0,  # A significantly wins
            'B>A': 0.25,  # B slightly wins
            'B>>A': 0.0,  # B significantly wins
        }
    else:
        score_mapping = {
            'A=B': 0.5,  # Tie
            'A>B': 0.25,  # A slightly wins
            'A>>B': 0.0,  # A significantly wins
            'B>A': 0.75,  # B slightly wins
            'B>>A': 1.0,  # B significantly wins
        }

    base_score = score_mapping.get(result, 0.5)

    return base_score


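# Illustrative example: the same verdict scores 0.75 in the default direction and
# 0.25 with reverse=True, i.e. the A/B scores are mirrored.
#
#   >>> get_judge_score('A>B')
#   0.75
#   >>> get_judge_score('A>B', reverse=True)
#   0.25
#   >>> get_judge_score('not-a-verdict')  # unrecognized judgments fall back to a tie
#   0.5

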
def get_battles_from_row(row, first_game_only=False, multiplier=3):
    """Convert one comparison row (two games) into battle records, repeating decisive ('>>') verdicts `multiplier` times."""
    results = []

    game = row['games'][0]
    output = {'model_a': game['model_a'], 'model_b': game['model_b']}

    weight = 1
    if game['judgment'] == 'A=B':
        output['winner'] = 'tie'
    elif game['judgment'] == 'A>B':
        output['winner'] = 'model_a'
    elif game['judgment'] == 'A>>B':
        output['winner'] = 'model_a'
        weight = multiplier
    elif game['judgment'] == 'B>A':
        output['winner'] = 'model_b'
    elif game['judgment'] == 'B>>A':
        output['winner'] = 'model_b'
        weight = multiplier
    else:
        weight = 0

    if weight:
        results += [output] * weight

    if first_game_only:
        return pd.DataFrame(results)

    # Don't change the order of model_a and model_b
    output = {'model_a': game['model_a'], 'model_b': game['model_b']}

    # Game 2: the verdict is interpreted with A and B swapped,
    # so 'A>B' here counts as a win for model_b.
    game = row['games'][1]

    weight = 1
    if game['judgment'] == 'A=B':
        output['winner'] = 'tie'
    elif game['judgment'] == 'A>B':
        output['winner'] = 'model_b'
    elif game['judgment'] == 'A>>B':
        output['winner'] = 'model_b'
        weight = multiplier
    elif game['judgment'] == 'B>A':
        output['winner'] = 'model_a'
    elif game['judgment'] == 'B>>A':
        output['winner'] = 'model_a'
        weight = multiplier
    else:
        weight = 0

    if weight:
        results += [output] * weight

    return pd.DataFrame(results)


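# Illustrative sketch of the expected row shape and the resulting battle records
# (field names inferred from the accesses above):
#
#   row = {'games': [
#       {'model_a': 'm1', 'model_b': 'm2', 'judgment': 'A>>B'},  # decisive: repeated `multiplier` times
#       {'model_a': 'm1', 'model_b': 'm2', 'judgment': 'B>A'},   # game 2: maps back to a win for m1
#   ]}
#   get_battles_from_row(row)
#   # -> DataFrame with 4 rows, all {'model_a': 'm1', 'model_b': 'm2', 'winner': 'model_a'}

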
def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gpt4-0314'):
    """Fit Bradley-Terry ratings to the battles via logistic regression and return Elo-scaled scores per model."""
    models = pd.concat([df['model_a'], df['model_b']]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # Duplicate the battles so that, together with the tie handling below,
    # one win counts as two wins and one tie counts as one win per side.
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df['model_a']]] = +math.log(base)
    X[np.arange(n), models[df['model_b']]] = -math.log(base)

    # One A win => two A wins (because the battles were duplicated)
    Y = np.zeros(n)
    Y[df['winner'] == 'model_a'] = 1.0

    # One tie => one A win + one B win:
    # mark only the first copy of each tie (and 'tie (bothbad)') as an A win
    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    if len(np.unique(Y)) < 2:
        logger.warning('Only one class in the data')
        elo_scores = pd.Series(init_rating, index=models.index)
        if np.all(Y == 1.0):
            elo_scores[df['model_a'].iloc[0]] += scale  # Boost the winning model
        elif np.all(Y == 0.0):
            elo_scores[df['model_b'].iloc[0]] += scale  # Boost the winning model
        return elo_scores.sort_values(ascending=False)

    lr = LogisticRegression(
        fit_intercept=False, penalty=None, tol=1e-8)  # A small tol may be needed when the judge model is not GPT-4
    lr.fit(X, Y)

    elo_scores = scale * lr.coef_[0] + init_rating

    # Anchor the baseline model at 1000
    if baseline_model in models.index:
        elo_scores += 1000 - elo_scores[models[baseline_model]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


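# Sketch of the underlying model (the standard Bradley-Terry / Elo relationship,
# noted here for reference rather than stated in this file):
#
#   P(model_a beats model_b) = 1 / (1 + base ** ((r_b - r_a) / scale))
#
# The logistic regression fits one coefficient per model on +log(base) / -log(base)
# features, so `scale * coef + init_rating` converts the fitted strengths to the
# familiar Elo scale (base=10, scale=400, baseline anchored at 1000).
#
#   battles = pd.DataFrame([
#       {'model_a': 'm1', 'model_b': 'm2', 'winner': 'model_a'},
#       {'model_a': 'm1', 'model_b': 'm2', 'winner': 'tie'},
#   ])
#   compute_mle_elo(battles, baseline_model='m1')  # ratings anchored so m1 is exactly 1000

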
def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model='gpt-4-0314'):
    """Bootstrap the rating computation: resample the battles `num_round` times and collect the resulting ratings."""
    rows = []
    kwargs = {}
    if 'baseline_model' in inspect.signature(func_compute_elo).parameters:
        kwargs['baseline_model'] = baseline_model
    for _ in tqdm(range(num_round), desc='bootstrap'):
        res = func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs)
        if res is not None:
            rows.append(res)
    df = pd.DataFrame(rows)
    # Order the columns (models) by their median bootstrap rating, descending
    return df[df.median().sort_values(ascending=False).index]


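# Illustrative usage (a sketch; `battles` is a DataFrame of model_a/model_b/winner rows):
#
#   bootstrap_df = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
#   # One column per model, one row per bootstrap round; e.g. a 95% interval per model:
#   ci = bootstrap_df.quantile([0.025, 0.975])

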
def predict_win_rate(elo_ratings, scale=400, base=10, init_rating=1000):
    """Predict pairwise win rates from Elo ratings (init_rating is accepted but not used)."""
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            # Expected score of a against b under the Elo model
            ea = 1 / (1 + base**((elo_ratings[b] - elo_ratings[a]) / scale))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {a: [wins[a][b] if a != b else np.nan for b in names] for a in names}

    df = pd.DataFrame(data, index=names)
    df.index.name = 'model_a'
    df.columns.name = 'model_b'
    return df.T


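# Worked example of the expected-score formula above: with base=10 and scale=400,
# a 100-point rating gap gives the lower-rated model roughly a 36% predicted win rate.
#
#   table = predict_win_rate({'m1': 1100, 'm2': 1000})
#   table.loc['m2', 'm1']  # 1 / (1 + 10 ** (100 / 400)) ≈ 0.36

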
def get_win_rate_column(df, column, baseline='gpt4-0314'):
    """Return each model's predicted win rate against `baseline`, based on the ratings in `column` of `df`."""
    to_dict = df[['model', column]].set_index('model').to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
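

# End-to-end sketch (illustrative only; `judged_rows` and the model names are
# hypothetical, and the baseline must appear among the rated models):
#
#   battles = pd.concat([get_battles_from_row(row) for row in judged_rows], ignore_index=True)
#   bootstrap_df = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
#   leaderboard = pd.DataFrame({'model': bootstrap_df.columns, 'score': bootstrap_df.median().values})
#   win_rates = get_win_rate_column(leaderboard, 'score', baseline='gpt4-0314')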