"""
|
|
Multi model components for the Evalscope dashboard.
|
|
"""
|
|
import gradio as gr
|
|
import os
|
|
import pandas as pd
|
|
from dataclasses import dataclass
|
|
from typing import TYPE_CHECKING
|
|
|
|
from evalscope.report import ReportKey, get_data_frame
|
|
from evalscope.utils.logger import get_logger
|
|
from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
|
|
from ..utils.data_utils import (get_acc_report_df, get_compare_report_df, get_model_prediction, get_single_dataset_df,
|
|
load_multi_report, load_single_report)
|
|
from ..utils.localization import get_multi_model_locale
|
|
from ..utils.text_utils import convert_markdown_image, process_model_prediction
|
|
from ..utils.visualization import plot_multi_report_radar
|
|
|
|
if TYPE_CHECKING:
|
|
from .sidebar import SidebarComponents
|
|
|
|
logger = get_logger()
|
|
|
|
|
|
@dataclass
|
|
class MultiModelComponents:
|
|
multi_report_name: gr.Dropdown
|
|
|
|
|
|
def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
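    """Build the multi-model comparison tab and wire up its event handlers.

    Args:
        sidebar: Sidebar components providing the report root path.
        lang: Language code used to look up localized UI labels.

    Returns:
        MultiModelComponents exposing the multi-report selection dropdown.
    """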
    locale_dict = get_multi_model_locale(lang)

    multi_report_name = gr.Dropdown(label=locale_dict['select_reports'], choices=[], multiselect=True, interactive=True)
    report_list = gr.State([])

    with gr.Tab(locale_dict['models_overview']):
        gr.Markdown(locale_dict['model_radar'])
        radar_plot = gr.Plot(value=None)
        gr.Markdown(locale_dict['model_scores'])
        score_table = gr.DataFrame(value=None)

    with gr.Tab(locale_dict['model_comparison_details']):
        with gr.Row():
            model_a_select = gr.Dropdown(label=locale_dict['select_model_a'], choices=[], interactive=True)
            model_b_select = gr.Dropdown(label=locale_dict['select_model_b'], choices=[], interactive=True)

        # States to store selected models' information
        model_a_report = gr.State(None)
        model_b_report = gr.State(None)
        model_a_dir = gr.State(None)
        model_b_dir = gr.State(None)
        model_a_name = gr.State(None)
        model_b_name = gr.State(None)

        dataset_radio = gr.Radio(label=locale_dict['select_dataset'], choices=[], show_label=True, interactive=True)

        gr.Markdown(f"### {locale_dict['model_predictions']}")
        subset_select = gr.Dropdown(label=locale_dict['select_subset'], choices=[], show_label=True, interactive=True)

        with gr.Row():
            answer_mode_radio = gr.Radio(
                label=locale_dict.get('answer_mode'),
                choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
                value='All',
                interactive=True)
            score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)
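
        # State holders: the merged A/B prediction dataframe for the selected
        # subset, and the filtered view produced by the answer-mode filter.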
        data_comparison_df = gr.State(None)
        filtered_comparison_df = gr.State(None)

        # Statistics row
        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Counts*')
                comparison_counts = gr.Markdown('')
            with gr.Column():
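                # One comparison sample is shown per page, so the page number
                # doubles as a 1-based index into the filtered dataframe.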
                page_number = gr.Number(
                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)

        # Input and Gold answer sections remain at the top
        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Input*')
                input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)

            with gr.Column():
                gr.Markdown('### *Gold Answer*')
                gold_text = gr.Markdown('', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS)

        # Table-like layout for direct comparison
        with gr.Row():
            # Headers for the two models
            with gr.Column(scale=1):
                gr.Markdown('### *Model A*')
            with gr.Column(scale=1):
                gr.Markdown('### *Model B*')

        # Score comparison row
        with gr.Row():
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Score*')
                model_a_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Score*')
                model_b_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)

        # Normalized score comparison row
        with gr.Row():
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Normalized Score*')
                model_a_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Normalized Score*')
                model_b_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)

        # Prediction comparison row
        with gr.Row():
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Prediction*')
                model_a_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Prediction*')
                model_b_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)

        # Generated output comparison row
        with gr.Row():
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Generated*')
                model_a_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)
            with gr.Column(scale=1, variant='panel'):
                gr.Markdown('### *Generated*')
                model_b_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS)

    @multi_report_name.change(
        inputs=[sidebar.root_path, multi_report_name],
        outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select])
    def update_multi_report_data(root_path, multi_report_names):
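        """Load the selected reports and refresh the radar plot, score table,
        and the model A/B dropdown choices."""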
        if not multi_report_names:
            return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()

        report_list = load_multi_report(root_path, multi_report_names)
        report_df, _ = get_acc_report_df(report_list)
        report_radar_plot = plot_multi_report_radar(report_df)
        _, styler = get_compare_report_df(report_df)

        # Use the selected report names as the model A/B dropdown choices
        model_choices = multi_report_names

        return report_list, report_radar_plot, styler, gr.update(
            choices=model_choices, value=model_choices[0]), gr.update(
                choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)

    @gr.on(
        triggers=[model_a_select.change, model_b_select.change],
        inputs=[sidebar.root_path, model_a_select, model_b_select],
        outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio])
    def update_selected_models(root_path, model_a, model_b):
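        """Load the individual reports for both selected models and offer only
        the datasets that the two evaluations have in common."""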
        if not model_a or not model_b:
            return gr.skip()

        # Load individual reports for both models
        model_a_reports, datasets_a, _ = load_single_report(root_path, model_a)
        model_b_reports, datasets_b, _ = load_single_report(root_path, model_b)

        # Get common datasets
        common_datasets = list(set(datasets_a).intersection(set(datasets_b)))

        # Extract work directories and model names
        model_a_dir = os.path.join(root_path, model_a.split(REPORT_TOKEN)[0])
        model_b_dir = os.path.join(root_path, model_b.split(REPORT_TOKEN)[0])

        model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
        model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

        return (model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
                gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None))

    @gr.on(
        triggers=[dataset_radio.change],
        inputs=[dataset_radio, model_a_report, model_b_report],
        outputs=[subset_select, data_comparison_df])
    def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
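        """Refresh the subset choices for the chosen dataset and reset the
        comparison dataframe."""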
        if not dataset_name or model_a_report is None or model_b_report is None:
            return gr.skip()

        # Get dataframes for both models
        report_df_a = get_data_frame(report_list=model_a_report)
        data_score_df_a, _ = get_single_dataset_df(report_df_a, dataset_name)

        report_df_b = get_data_frame(report_list=model_b_report)
        data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)

        # Get subset choices - they should be the same for both models
        subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()

        return gr.update(choices=subsets, value=None), None

    @gr.on(
        triggers=[subset_select.change],
        inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
        outputs=[data_comparison_df, page_number])
    def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
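        """Load both models' predictions for the chosen subset and merge them
        on the sample Index for side-by-side comparison."""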
        if not subset_name or not dataset_name:
            return gr.skip()

        # Get predictions for both models
        df_a = get_model_prediction(model_a_dir, model_a_name, dataset_name, subset_name)
        df_b = get_model_prediction(model_b_dir, model_b_name, dataset_name, subset_name)

        # Merge the dataframes on the Index column for comparison
        if df_a is not None and df_b is not None:
            # Preserve the Index column before prefixing so the frames can be merged on it
            index_a = df_a['Index'].copy()
            index_b = df_b['Index'].copy()

            df_a = df_a.add_prefix('A_')
            df_b = df_b.add_prefix('B_')

            # Restore the Index column
            df_a['Index'] = index_a
            df_b['Index'] = index_b

            # Merge on Index
            comparison_df = pd.merge(df_a, df_b, on='Index')

            return comparison_df, 1

        return None, 1

    @gr.on(
        triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
        inputs=[data_comparison_df, answer_mode_radio, score_threshold],
        outputs=[filtered_comparison_df, page_number, comparison_counts])
    def filter_comparison_data(comparison_df, answer_mode, score_threshold):
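        """Filter the merged dataframe by the selected answer mode and score
        threshold, and compute the pass/fail counts shown above the pager."""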
        if comparison_df is None:
            return None, gr.update(value=1, maximum=1), ''

        all_count = len(comparison_df)

        # Apply filtering based on the selected mode and threshold
        if answer_mode == 'Pass A & B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                        & (comparison_df['B_NScore'] >= score_threshold)]
        elif answer_mode == 'Fail A & B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                        & (comparison_df['B_NScore'] < score_threshold)]
        elif answer_mode == 'Pass A, Fail B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                        & (comparison_df['B_NScore'] < score_threshold)]
        elif answer_mode == 'Fail A, Pass B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                        & (comparison_df['B_NScore'] >= score_threshold)]
        else:  # All
            filtered_df = comparison_df

        # Count statistics
        pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
        pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
        pass_both_count = len(comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                            & (comparison_df['B_NScore'] >= score_threshold)])
        fail_both_count = len(comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                            & (comparison_df['B_NScore'] < score_threshold)])

        counts_text = (f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
                       f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}')

        max_page = max(1, len(filtered_df))

        return filtered_df, gr.update(value=1, maximum=max_page), counts_text

    @gr.on(
        triggers=[filtered_comparison_df.change, page_number.change, model_a_select.change, model_b_select.change],
        inputs=[
            filtered_comparison_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name,
            model_b_name
        ],
        outputs=[
            input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
            model_b_pred, model_b_score, model_b_nscore
        ])
    def update_comparison_display(filtered_df, page_number, score_threshold, model_a_select, model_b_select,
                                  model_a_name_val, model_b_name_val):
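        """Render the sample on the current page: the shared input and gold
        answer, plus each model's generated output, prediction, score, and a
        colored normalized-score badge indicating pass/fail against the
        threshold."""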
        if filtered_df is None or len(filtered_df) == 0:
            return '', '', '', '', '', '', '', '', '', ''

        # Get the row for the current page
        start = (page_number - 1)
        if start >= len(filtered_df):
            return '', '', '', '', '', '', '', '', '', ''

        row = filtered_df.iloc[start]

        # Process common data
        input_md = process_model_prediction(row['A_Input'])  # Use A's input (same as B's)
        gold_md = process_model_prediction(row['A_Gold'])  # Use A's gold (same as B's)

        # Process Model A data
        a_generated_md = process_model_prediction(row['A_Generated'])
        a_pred_md = convert_markdown_image(process_model_prediction(row['A_Pred']))
        a_score_md = process_model_prediction(row['A_Score'])
        a_nscore_val = float(row['A_NScore']) if not pd.isna(row['A_NScore']) else 0.0

        # Process Model B data
        b_generated_md = process_model_prediction(row['B_Generated'])
        b_pred_md = convert_markdown_image(process_model_prediction(row['B_Pred']))
        b_score_md = process_model_prediction(row['B_Score'])
        b_nscore_val = float(row['B_NScore']) if not pd.isna(row['B_NScore']) else 0.0

        # Apply visual indicators with backgrounds that make differences more obvious
        if a_nscore_val >= score_threshold:
            a_nscore_html = f"<div style='background-color:rgb(45, 104, 62); padding:10px;'>{a_nscore_val}</div>"
        else:
            a_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{a_nscore_val}</div>"

        if b_nscore_val >= score_threshold:
            b_nscore_html = f"<div style='background-color:rgb(45, 104, 62); padding:10px;'>{b_nscore_val}</div>"
        else:
            b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"

        return (input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
                b_score_md, b_nscore_html)

    return MultiModelComponents(multi_report_name=multi_report_name)