# Source: evalscope v0.17.0 — evalscope/app/utils/data_utils.py

"""
Data loading and processing utilities for the Evalscope dashboard.
"""
import glob
import numpy as np
import os
import pandas as pd
from typing import Any, Dict, List, Union
from evalscope.constants import DataCollection
from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
from evalscope.utils.logger import get_logger
from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN
logger = get_logger()
def scan_for_report_folders(root_path):
    """Scan ``root_path`` for experiment folders that contain a reports subdirectory.

    Each discovered run is encoded as
    ``<folder><REPORT_TOKEN><model><MODEL_TOKEN><ds1><DATASET_TOKEN><ds2>...``.

    Args:
        root_path: Directory holding one subfolder per experiment run.

    Returns:
        list[str]: Encoded report names, sorted in reverse lexical order
        (timestamped folder names therefore appear newest first).
    """
    logger.debug(f'Scanning for report folders in {root_path}')
    if not os.path.exists(root_path):
        return []

    found = []
    for exp_dir in glob.glob(os.path.join(root_path, '*')):
        # Skip experiment folders that have no reports subdirectory.
        reports_dir = os.path.join(exp_dir, OutputsStructure.REPORTS_DIR)
        if not os.path.exists(reports_dir):
            continue
        # One subdirectory per evaluated model inside the reports folder.
        for model_dir in glob.glob(os.path.join(reports_dir, '*')):
            if not os.path.isdir(model_dir):
                continue
            # One JSON file per dataset; join the file stems with the dataset token.
            dataset_names = [
                os.path.splitext(os.path.basename(json_file))[0]
                for json_file in glob.glob(os.path.join(model_dir, '*.json'))
            ]
            found.append('{}{}{}{}{}'.format(
                os.path.basename(exp_dir), REPORT_TOKEN, os.path.basename(model_dir), MODEL_TOKEN,
                DATASET_TOKEN.join(dataset_names)))

    found.sort(reverse=True)
    logger.debug(f'reports: {found}')
    return found
def process_report_name(report_name: str):
    """Decode an encoded report name produced by ``scan_for_report_folders``.

    Args:
        report_name: String of the form
            ``<prefix><REPORT_TOKEN><model><MODEL_TOKEN><ds1><DATASET_TOKEN>...``.

    Returns:
        tuple[str, str, list[str]]: ``(prefix, model_name, datasets)``.

    Raises:
        ValueError: if a required separator token is missing.
    """
    # Bound each split to the first occurrence so a stray token later in the
    # name (e.g. inside a model or dataset name) cannot raise
    # "too many values to unpack".
    prefix, report_name = report_name.split(REPORT_TOKEN, 1)
    model_name, datasets = report_name.split(MODEL_TOKEN, 1)
    datasets = datasets.split(DATASET_TOKEN)
    return prefix, model_name, datasets
def load_single_report(root_path: str, report_name: str):
    """Load reports, dataset names and task config for one encoded report name.

    Args:
        root_path: Root directory containing the experiment folders.
        report_name: Encoded name as produced by ``scan_for_report_folders``.

    Returns:
        tuple: ``(report_list, datasets, task_cfg)``.

    Raises:
        FileNotFoundError: when the run has no YAML configuration file.
    """
    prefix, model_name, datasets = process_report_name(report_name)
    model_reports_dir = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
    report_list = get_report_list([model_reports_dir])

    configs_dir = os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)
    config_files = glob.glob(os.path.join(configs_dir, '*.yaml'))
    if not config_files:
        raise FileNotFoundError(f'No configuration files found in {configs_dir}')
    # The first YAML found is taken as the task configuration.
    task_cfg = yaml_to_dict(config_files[0])
    return report_list, datasets, task_cfg
def load_multi_report(root_path: str, report_names: List[str]):
    """Collect and concatenate the report lists for several encoded report names.

    Args:
        root_path: Root directory containing the experiment folders.
        report_names: Encoded names as produced by ``scan_for_report_folders``.

    Returns:
        list: All reports from every named run, in input order.
    """
    all_reports = []
    for name in report_names:
        prefix, model_name, _datasets = process_report_name(name)
        model_reports_dir = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
        all_reports.extend(get_report_list([model_reports_dir]))
    return all_reports
def get_acc_report_df(report_list: List[Report]):
    """Flatten reports into an accuracy table with one row per model/dataset.

    Collection reports (``report.name == DataCollection.NAME``) contribute one
    row per metric category; ordinary reports contribute a single row.

    Args:
        report_list: Parsed report objects.

    Returns:
        tuple[pd.DataFrame, Styler]: the raw frame and a score-gradient view.
    """
    rows = []
    for report in report_list:
        if report.name == DataCollection.NAME:
            # Collection report: expand every metric category into its own row.
            for metric in report.metrics:
                for category in metric.categories:
                    rows.append({
                        ReportKey.model_name: report.model_name,
                        ReportKey.dataset_name: '/'.join(category.name),
                        ReportKey.score: category.score,
                        ReportKey.num: category.num,
                    })
        else:
            # Plain report: one row, using the first metric's sample count.
            rows.append({
                ReportKey.model_name: report.model_name,
                ReportKey.dataset_name: report.dataset_name,
                ReportKey.score: report.score,
                ReportKey.num: report.metrics[0].num,
            })
    df = pd.DataFrame.from_dict(rows, orient='columns')
    return df, style_df(df, columns=[ReportKey.score])
def style_df(df: pd.DataFrame, columns: List[str] = None):
# Apply background gradient to the specified columns
styler = df.style.background_gradient(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
# Format the dataframe with a precision of 4 decimal places
styler.format(precision=4)
return styler
def get_compare_report_df(acc_df: pd.DataFrame):
    """Pivot the accuracy table into one row per model, one column per dataset.

    Args:
        acc_df: Flat frame from ``get_acc_report_df``.

    Returns:
        tuple[pd.DataFrame, Styler]: the pivoted frame and its styled view.
    """
    pivot = acc_df.pivot_table(index=ReportKey.model_name, columns=ReportKey.dataset_name, values=ReportKey.score)
    pivot.reset_index(inplace=True)
    return pivot, style_df(pivot)
def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
    """Select the rows of one dataset and return them with a styled view.

    Args:
        df: Flat accuracy frame from ``get_acc_report_df``.
        dataset_name: Value to match in the dataset-name column.

    Returns:
        tuple[pd.DataFrame, Styler]: the filtered frame and its styled view.
    """
    subset = df[df[ReportKey.dataset_name] == dataset_name]
    return subset, style_df(subset, columns=[ReportKey.score])
def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
    """Return the analysis text of the first report for ``dataset_name``.

    Args:
        report_list: Parsed report objects.
        dataset_name: Dataset to look up.

    Returns:
        str: The matching report's ``analysis``, or ``'N/A'`` if none matches.
    """
    matched = next((r for r in report_list if r.dataset_name == dataset_name), None)
    return matched.analysis if matched is not None else 'N/A'
def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
    """Load per-sample review records for one model/dataset/subset.

    Reads ``<work_dir>/<reviews>/<model_name>/<dataset>_<subset>.jsonl`` and
    emits one row per (sample, choice) with the raw input, generated/parsed
    answers, the raw score and a normalized float score.

    Args:
        work_dir: Run output directory.
        model_name: Model subdirectory inside the reviews folder.
        dataset_name: Dataset part of the JSONL file name.
        subset_name: Subset part of the file name; '/' is replaced since
            collection subsets are path-like and not filename-safe.

    Returns:
        pd.DataFrame: columns Index, Input, Generated, Gold, Pred, Score, NScore.
    """
    reviews_dir = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
    safe_subset = subset_name.replace('/', '_')  # for collection report
    review_path = os.path.join(reviews_dir, f'{dataset_name}_{safe_subset}.jsonl')
    logger.debug(f'review_path: {review_path}')

    records = pd.read_json(review_path, lines=True)
    rows = []
    for _, record in records.iterrows():
        raw_input = record['raw_input']
        sample_index = record['index']
        for choice_index, choice in enumerate(record['choices']):
            generated = choice['message']['content']
            review = choice['review']
            gold = review['gold']
            pred = review['pred']
            score = review['result']
            rows.append({
                'Index': f'{sample_index}_{choice_index}',
                'Input': raw_input,
                # Collapse duplicated text so the dashboard table stays readable.
                'Generated': generated if generated != pred else '*Same as Pred*',
                'Gold': gold if gold != raw_input else '*Same as Input*',
                'Pred': pred,
                'Score': score,
                'NScore': normalize_score(score),
            })
    return pd.DataFrame(rows)
def normalize_score(score):
    """Coerce a review score into a float, best-effort.

    Rules:
        * bool  -> 1.0 / 0.0
        * dict  -> float of the first value (0.0 for an empty dict)
        * other -> ``float(score)``
    Anything that cannot be converted falls back to 0.0.

    Args:
        score: Raw score value from a review record.

    Returns:
        float: normalized score.
    """
    try:
        # Check bool first: bool is a subclass of int and floats cleanly.
        if isinstance(score, bool):
            return float(score)
        if isinstance(score, dict):
            first = next(iter(score.values()), None)
            return 0.0 if first is None else float(first)
        return float(score)
    except (ValueError, TypeError):
        return 0.0