""" Multi model components for the Evalscope dashboard. """ import gradio as gr import os import pandas as pd from dataclasses import dataclass from typing import TYPE_CHECKING from evalscope.report import ReportKey, get_data_frame from evalscope.utils.logger import get_logger from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN from ..utils.data_utils import (get_acc_report_df, get_compare_report_df, get_model_prediction, get_single_dataset_df, load_multi_report, load_single_report) from ..utils.localization import get_multi_model_locale from ..utils.text_utils import convert_markdown_image, process_model_prediction from ..utils.visualization import plot_multi_report_radar if TYPE_CHECKING: from .sidebar import SidebarComponents logger = get_logger() @dataclass class MultiModelComponents: multi_report_name: gr.Dropdown def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str): locale_dict = get_multi_model_locale(lang) multi_report_name = gr.Dropdown(label=locale_dict['select_reports'], choices=[], multiselect=True, interactive=True) report_list = gr.State([]) with gr.Tab(locale_dict['models_overview']): gr.Markdown(locale_dict['model_radar']) radar_plot = gr.Plot(value=None) gr.Markdown(locale_dict['model_scores']) score_table = gr.DataFrame(value=None) with gr.Tab(locale_dict['model_comparison_details']): with gr.Row(): model_a_select = gr.Dropdown(label=locale_dict['select_model_a'], choices=[], interactive=True) model_b_select = gr.Dropdown(label=locale_dict['select_model_b'], choices=[], interactive=True) # States to store selected models' information model_a_report = gr.State(None) model_b_report = gr.State(None) model_a_dir = gr.State(None) model_b_dir = gr.State(None) model_a_name = gr.State(None) model_b_name = gr.State(None) dataset_radio = gr.Radio(label=locale_dict['select_dataset'], choices=[], show_label=True, interactive=True) gr.Markdown(f"### {locale_dict['model_predictions']}") subset_select = gr.Dropdown(label=locale_dict['select_subset'], choices=[], show_label=True, interactive=True) with gr.Row(): answer_mode_radio = gr.Radio( label=locale_dict.get('answer_mode'), choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'], value='All', interactive=True) score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True) data_comparison_df = gr.State(None) filtered_comparison_df = gr.State(None) # Statistics row with gr.Row(variant='panel'): with gr.Column(): gr.Markdown('### *Counts*') comparison_counts = gr.Markdown('') with gr.Column(): page_number = gr.Number( value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True) # Input and Gold answer sections remain at the top with gr.Row(variant='panel'): with gr.Column(): gr.Markdown('### *Input*') input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS) with gr.Column(): gr.Markdown('### *Gold Answer*') gold_text = gr.Markdown('', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS) # Table-like layout for direct comparison with gr.Row(): # Headers for the two models with gr.Column(scale=1): gr.Markdown('### *Model A*') with gr.Column(scale=1): gr.Markdown('### *Model B*') # Score comparison row with gr.Row(): with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Score*') model_a_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Score*') model_b_score = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) # Normalized 
score comparison row with gr.Row(): with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Normalized Score*') model_a_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Normalized Score*') model_b_nscore = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) # Prediction comparison row with gr.Row(): with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Prediction*') model_a_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Prediction*') model_b_pred = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) # Generated output comparison row with gr.Row(): with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Generated*') model_a_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) with gr.Column(scale=1, variant='panel'): gr.Markdown('### *Generated*') model_b_generated = gr.Markdown('', latex_delimiters=LATEX_DELIMITERS) @multi_report_name.change( inputs=[sidebar.root_path, multi_report_name], outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]) def update_multi_report_data(root_path, multi_report_names): if not multi_report_names: return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip() report_list = load_multi_report(root_path, multi_report_names) report_df, _ = get_acc_report_df(report_list) report_radar_plot = plot_multi_report_radar(report_df) _, styler = get_compare_report_df(report_df) # Extract model names for dropdowns model_choices = multi_report_names return report_list, report_radar_plot, styler, gr.update( choices=model_choices, value=model_choices[0]), gr.update( choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None) @gr.on( triggers=[model_a_select.change, model_b_select.change], inputs=[sidebar.root_path, model_a_select, model_b_select], outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]) def update_selected_models(root_path, model_a, model_b): if not model_a or not model_b: return gr.skip() # Load individual reports for both models model_a_reports, datasets_a, _ = load_single_report(root_path, model_a) model_b_reports, datasets_b, _ = load_single_report(root_path, model_b) # Get common datasets common_datasets = list(set(datasets_a).intersection(set(datasets_b))) # Extract work directories and model names model_a_dir = os.path.join(root_path, model_a.split(REPORT_TOKEN)[0]) model_b_dir = os.path.join(root_path, model_b.split(REPORT_TOKEN)[0]) model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0] model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0] return (model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name, gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None)) @gr.on( triggers=[dataset_radio.change], inputs=[dataset_radio, model_a_report, model_b_report], outputs=[subset_select, data_comparison_df]) def update_dataset_comparison(dataset_name, model_a_report, model_b_report): if not dataset_name or model_a_report is None or model_b_report is None: return gr.skip() # Get dataframes for both models report_df_a = get_data_frame(report_list=model_a_report) data_score_df_a, _ = get_single_dataset_df(report_df_a, dataset_name) report_df_b = get_data_frame(report_list=model_b_report) data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name) # Get subset choices - should be same for both models 
        subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
        return gr.update(choices=subsets, value=None), None

    @gr.on(
        triggers=[subset_select.change],
        inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
        outputs=[data_comparison_df, page_number])
    def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
        if not subset_name or not dataset_name:
            return gr.skip()
        # Get predictions for both models
        df_a = get_model_prediction(model_a_dir, model_a_name, dataset_name, subset_name)
        df_b = get_model_prediction(model_b_dir, model_b_name, dataset_name, subset_name)
        # Merge the two prediction tables so each row pairs model A and model B on the same example
        if df_a is not None and df_b is not None:
            # Save the Index column if it exists
            index_a = df_a['Index'].copy()
            index_b = df_b['Index'].copy()
            df_a = df_a.add_prefix('A_')
            df_b = df_b.add_prefix('B_')
            # Restore the Index column
            df_a['Index'] = index_a
            df_b['Index'] = index_b
            # Merge on Index
            comparison_df = pd.merge(df_a, df_b, on='Index')
            return comparison_df, 1
        return None, 1

    @gr.on(
        triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
        inputs=[data_comparison_df, answer_mode_radio, score_threshold],
        outputs=[filtered_comparison_df, page_number, comparison_counts])
    def filter_comparison_data(comparison_df, answer_mode, score_threshold):
        if comparison_df is None:
            return None, gr.update(value=1, maximum=1), ''

        all_count = len(comparison_df)

        # Apply filtering based on the selected mode and threshold
        if answer_mode == 'Pass A & B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                        & (comparison_df['B_NScore'] >= score_threshold)]
        elif answer_mode == 'Fail A & B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                        & (comparison_df['B_NScore'] < score_threshold)]
        elif answer_mode == 'Pass A, Fail B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                        & (comparison_df['B_NScore'] < score_threshold)]
        elif answer_mode == 'Fail A, Pass B':
            filtered_df = comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                        & (comparison_df['B_NScore'] >= score_threshold)]
        else:  # All
            filtered_df = comparison_df

        # Count statistics
        pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
        pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
        pass_both_count = len(comparison_df[(comparison_df['A_NScore'] >= score_threshold)
                                            & (comparison_df['B_NScore'] >= score_threshold)])
        fail_both_count = len(comparison_df[(comparison_df['A_NScore'] < score_threshold)
                                            & (comparison_df['B_NScore'] < score_threshold)])
        counts_text = (f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
                       f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}')

        # One example per page
        max_page = max(1, len(filtered_df))
        return filtered_df, gr.update(value=1, maximum=max_page), counts_text

    @gr.on(
        triggers=[filtered_comparison_df.change, page_number.change, model_a_select.change, model_b_select.change],
        inputs=[
            filtered_comparison_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name,
            model_b_name
        ],
        outputs=[
            input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
            model_b_pred, model_b_score, model_b_nscore
        ])
    def update_comparison_display(filtered_df, page_number, score_threshold, model_a_select, model_b_select,
                                  model_a_name_val, model_b_name_val):
        if filtered_df is None or len(filtered_df) == 0:
            return '', '', '', '', '', '', '', '', '', ''

        # Get the row for the current page
        start = page_number - 1
        if start >= len(filtered_df):
            return '', '', '', '', '', '', '', '', '', ''

        row = filtered_df.iloc[start]

        # Process common data
        input_md = process_model_prediction(row['A_Input'])  # Use A's input (same as B's)
        gold_md = process_model_prediction(row['A_Gold'])  # Use A's gold (same as B's)

        # Process Model A data
        a_generated_md = process_model_prediction(row['A_Generated'])
        a_pred_md = convert_markdown_image(process_model_prediction(row['A_Pred']))
        a_score_md = process_model_prediction(row['A_Score'])
        a_nscore_val = float(row['A_NScore']) if not pd.isna(row['A_NScore']) else 0.0

        # Process Model B data
        b_generated_md = process_model_prediction(row['B_Generated'])
        b_pred_md = convert_markdown_image(process_model_prediction(row['B_Pred']))
        b_score_md = process_model_prediction(row['B_Score'])
        b_nscore_val = float(row['B_NScore']) if not pd.isna(row['B_NScore']) else 0.0

        # Apply visual indicators with backgrounds that make differences more obvious
        if a_nscore_val >= score_threshold:
            # Green tint for a passing normalized score (exact styling values are illustrative)
            a_nscore_html = f"<div style='background-color: rgba(0, 200, 0, 0.2); padding: 8px;'>{a_nscore_val}</div>"
        else:
            # Red tint for a failing normalized score
            a_nscore_html = f"<div style='background-color: rgba(255, 0, 0, 0.2); padding: 8px;'>{a_nscore_val}</div>"

        if b_nscore_val >= score_threshold:
            b_nscore_html = f"<div style='background-color: rgba(0, 200, 0, 0.2); padding: 8px;'>{b_nscore_val}</div>"
        else:
            b_nscore_html = f"<div style='background-color: rgba(255, 0, 0, 0.2); padding: 8px;'>{b_nscore_val}</div>"

        return (input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
                b_score_md, b_nscore_html)

    return MultiModelComponents(multi_report_name=multi_report_name)