"""
Single model components for the Evalscope dashboard.
"""
import gradio as gr
import os
import pandas as pd
from dataclasses import dataclass
from typing import TYPE_CHECKING
from evalscope.report import Report, ReportKey, get_data_frame
from evalscope.utils.logger import get_logger
from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
from ..utils.data_utils import (get_acc_report_df, get_model_prediction, get_report_analysis, get_single_dataset_df,
                                load_single_report)
from ..utils.localization import get_single_model_locale
from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst

if TYPE_CHECKING:
    from .sidebar import SidebarComponents

logger = get_logger()


@dataclass
class SingleModelComponents:
    """Components of the single-model tab that the rest of the app needs to reference."""

    report_name: gr.Dropdown


def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
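    """Build the single-model tab of the dashboard and register its event handlers.

    Args:
        sidebar: The sidebar components; its root_path drives report loading.
        lang: UI language code used to look up localized labels.

    Returns:
        SingleModelComponents holding the report selector dropdown.
    """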
    locale_dict = get_single_model_locale(lang)

    # Create the UI components with localized labels
    report_name = gr.Dropdown(label=locale_dict['select_report'], choices=[], interactive=True)
    work_dir = gr.State(None)
    model_name = gr.State(None)

    with gr.Accordion(locale_dict['task_config'], open=False):
        task_config = gr.JSON(value=None)

    report_list = gr.State([])

    with gr.Tab(locale_dict['datasets_overview']):
        gr.Markdown(f'### {locale_dict["dataset_components"]}')
        sunburst_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_components'])
        gr.Markdown(f'### {locale_dict["dataset_scores"]}')
        score_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'])
        gr.Markdown(f'### {locale_dict["dataset_scores_table"]}')
        score_table = gr.DataFrame(value=None)

    with gr.Tab(locale_dict['dataset_details']):
        dataset_radio = gr.Radio(label=locale_dict['select_dataset'], choices=[], show_label=True, interactive=True)

        # show dataset details
        with gr.Accordion(locale_dict['report_analysis'], open=True):
            report_analysis = gr.Markdown(value='N/A')

        gr.Markdown(f'### {locale_dict["dataset_scores"]}')
        dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'])
        gr.Markdown(f'### {locale_dict["dataset_scores_table"]}')
        dataset_table = gr.DataFrame(value=None)

        gr.Markdown(f'### {locale_dict["model_prediction"]}')
        subset_select = gr.Dropdown(label=locale_dict['select_subset'], choices=[], show_label=True, interactive=True)

        with gr.Row():
            answer_mode_radio = gr.Radio(
                label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
            score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

        data_review_df = gr.State(None)
        filtered_review_df = gr.State(None)

        # show statistics
        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Counts*')
                answer_mode_counts = gr.Markdown('')
            with gr.Column():
                page_number = gr.Number(
                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)

        # show data review table
        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Score*')
                score_text = gr.Code('', elem_id='score_text', language='json', wrap_lines=False)
            with gr.Column():
                gr.Markdown('### *Normalized Score*')
                nscore = gr.Markdown('', elem_id='nscore_text', latex_delimiters=LATEX_DELIMITERS)

        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Gold*')
                gold_text = gr.Markdown('', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS)
            with gr.Column():
                gr.Markdown('### *Pred*')
                pred_text = gr.Markdown('', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS)

        with gr.Row(variant='panel'):
            with gr.Column():
                gr.Markdown('### *Input*')
                input_text = gr.Code('', elem_id='input_text', language='json', wrap_lines=False)
            with gr.Column():
                gr.Markdown('### *Generated*')
                generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)
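
    # Event handlers: selecting a report loads it and populates the dataset
    # choices, which in turn cascade into the overview plots, the per-dataset
    # views, and finally the paged, row-level prediction review below.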

    @report_name.change(
        inputs=[sidebar.root_path, report_name],
        outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
    def update_single_report_data(root_path, report_name):
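        """Load the selected report; derive its working directory and model name
        by splitting the report identifier on REPORT_TOKEN and MODEL_TOKEN."""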
        report_list, datasets, task_cfg = load_single_report(root_path, report_name)
        work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
        model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
        return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

    @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
    def update_single_report_score(report_list):
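        """Refresh the overview score plot, score table, and sunburst plot."""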
        report_score_df, styler = get_acc_report_df(report_list)
        report_score_plot = plot_single_report_scores(report_score_df)
        report_sunburst_plot = plot_single_report_sunburst(report_list)
        return report_score_plot, styler, report_sunburst_plot

    @gr.on(
        triggers=[dataset_radio.change, report_list.change],
        inputs=[dataset_radio, report_list],
        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
    def update_single_report_dataset(dataset_name, report_list):
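        """Update the per-dataset plot, table, analysis text, and subset choices."""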
        logger.debug(f'Updating single report dataset: {dataset_name}')
        report_df = get_data_frame(report_list=report_list)
        analysis = get_report_analysis(report_list, dataset_name)
        data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
        data_score_plot = plot_single_dataset_scores(data_score_df)
        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
        logger.debug(f'subsets: {subsets}')
        return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis

    @gr.on(
        triggers=[subset_select.change],
        inputs=[work_dir, model_name, dataset_radio, subset_select],
        outputs=[data_review_df, page_number])
    def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
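        """Load model predictions for the chosen subset and reset paging to 1."""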
        if not subset_name:
            return gr.skip()
        data_review_df = get_model_prediction(work_dir, model_name, dataset_name, subset_name)
        return data_review_df, 1

    @gr.on(
        triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
        inputs=[data_review_df, answer_mode_radio, score_threshold],
        outputs=[filtered_review_df, page_number, answer_mode_counts])
    def filter_data(data_review_df, answer_mode, score_threshold):
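        """Split the review data into Pass/Fail by NScore and update the counts.

        A row passes when its normalized score is at least score_threshold; the
        page slider maximum tracks the filtered row count (one row per page).
        """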
        if data_review_df is None:
            return None, gr.update(value=1, maximum=1), ''

        all_count = len(data_review_df)
        pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
        pass_count = len(pass_df)
        fail_count = all_count - pass_count
        counts_text = f'### All: {all_count} | Pass: {pass_count} | Fail: {fail_count}'

        if answer_mode == 'Pass':
            filtered_df = pass_df
        elif answer_mode == 'Fail':
            filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
        else:
            filtered_df = data_review_df

        max_page = max(1, len(filtered_df))
        return (filtered_df, gr.update(value=1, maximum=max_page), counts_text)

    @gr.on(
        triggers=[filtered_review_df.change, page_number.change],
        inputs=[filtered_review_df, page_number, score_threshold],
        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
    def update_table_components(filtered_df, page_number, score_threshold):
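        """Render the review fields for the single row on the current page,
        color-coding the normalized score green (pass) or red (fail)."""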
        if filtered_df is None or len(filtered_df) == 0:
            return '', '', '', '', '', ''

        # Get single row data for the current page; coerce to int in case the
        # Number component delivers the page as a float.
        start = int(page_number) - 1
        if start >= len(filtered_df):
            return '', '', '', '', '', ''
        row = filtered_df.iloc[start]

        # Process the data for display
        input_md = process_json_content(row['Input'])
        generated_md = process_model_prediction(row['Generated'])
        gold_md = process_model_prediction(row['Gold'])
        pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
        score_md = process_json_content(row['Score'])
        nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0

        if nscore_val >= score_threshold:
            nscore_md = f'<div style="background-color:rgb(45, 104, 62); padding:10px;">{nscore_val}</div>'
        else:
            nscore_md = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
        return input_md, generated_md, gold_md, pred_md, score_md, nscore_md

    return SingleModelComponents(report_name=report_name)
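

# Usage sketch (assumption: `create_sidebar` is a hypothetical factory stand-in;
# the actual sidebar builder lives in the sibling .sidebar module):
#
#     with gr.Blocks() as demo:
#         sidebar = create_sidebar('./outputs', lang='en')
#         single = create_single_model_tab(sidebar, lang='en')
#     # single.report_name can then be refreshed when new reports appear.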