import json
import os
import pandas as pd
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Tuple, Union

from evalscope.metrics import macro_mean, micro_mean
from evalscope.utils import get_logger

logger = get_logger()

ANALYSIS_PROMPT = """Based on the model evaluation results given in JSON format, write an analysis report with the following requirements:
1. The report consists of four parts: overall performance, key metric analysis, improvement suggestions, and conclusion.
2. If the model has multiple metrics, split them into low-score, mid-score and high-score groups and list them in a markdown table.
3. Output only the report itself, without any extra content.
4. Write the report in {language}.

```json
{report_str}
```
"""


def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
    """
    Normalize a score by rounding it.

    Args:
        score: input score, either a float or a dict.
            e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
        keep_num: number of decimal digits to keep.

    Returns:
        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
    """
    if isinstance(score, float):
        score = round(score, keep_num)
    elif isinstance(score, dict):
        score = {k: round(v, keep_num) for k, v in score.items()}
    else:
        logger.warning(f'Unknown score type: {type(score)}')
    return score
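
# A report is a tree of dataclasses: Report -> Metric -> Category -> Subset.
# At every level, ``score`` is the micro average of the children (weighted by
# ``num``, the number of samples) and ``macro_score`` is the unweighted macro
# average; both are computed in ``__post_init__`` via ``micro_mean`` and
# ``macro_mean`` from ``evalscope.metrics``.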
@dataclass
class Subset:
    name: str = 'default_subset'
    score: float = 0.0
    num: int = 0

    def __post_init__(self):
        self.score = normalize_score(self.score)


@dataclass
class Category:
    name: Tuple[str, ...] = field(default_factory=tuple)
    num: int = 0
    score: float = 0.0
    macro_score: float = 0.0
    subsets: List[Subset] = field(default_factory=list)

    def __post_init__(self):
        if isinstance(self.name, str):
            # ensure name is tuple format
            self.name = (self.name, )
        self.num = sum(subset.num for subset in self.subsets)
        self.score = normalize_score(micro_mean(self.subsets))
        self.macro_score = normalize_score(macro_mean(self.subsets))

    @classmethod
    def from_dict(cls, data: dict):
        subsets = [Subset(**subset) for subset in data.get('subsets', [])]
        return cls(name=data['name'], subsets=subsets)


@dataclass
class Metric:
    name: str = 'default_metric'
    num: int = 0
    score: float = 0.0
    macro_score: float = 0.0
    categories: List[Category] = field(default_factory=list)

    def __post_init__(self):
        self.num = sum(category.num for category in self.categories)
        self.score = normalize_score(micro_mean(self.categories))
        self.macro_score = normalize_score(macro_mean(self.categories))

    @classmethod
    def from_dict(cls, data: dict):
        categories = [Category.from_dict(category) for category in data.get('categories', [])]
        return cls(name=data['name'], categories=categories)


class ReportKey:
    model_name = 'Model'
    dataset_name = 'Dataset'
    metric_name = 'Metric'
    category_name = 'Category'
    category_prefix = 'Cat.'
    subset_name = 'Subset'
    num = 'Num'
    score = 'Score'


@dataclass
class Report:
    name: str = 'default_report'
    dataset_name: str = 'default_dataset'
    dataset_pretty_name: str = ''
    dataset_description: str = ''
    model_name: str = 'default_model'
    score: float = 0.0
    metrics: List[Metric] = field(default_factory=list)
    analysis: str = 'N/A'

    def __post_init__(self):
        # NOTE: only use the first metric by default
        if self.metrics:
            self.score = self.metrics[0].score

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)

    def to_json_str(self) -> str:
        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)

    def to_json(self, json_file: str):
        # ensure the directory exists (a bare file name has no directory part)
        json_dir = os.path.dirname(json_file)
        if json_dir:
            os.makedirs(json_dir, exist_ok=True)
        # write the report to a json file
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)

    @classmethod
    def from_dict(cls, data: dict):
        metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
        return cls(
            name=data['name'],
            dataset_name=data['dataset_name'],
            dataset_pretty_name=data.get('dataset_pretty_name', ''),
            dataset_description=data.get('dataset_description', ''),
            score=data['score'],
            model_name=data['model_name'],
            metrics=metrics,
            analysis=data.get('analysis', 'N/A'),
        )

    @classmethod
    def from_json(cls, json_file: str):
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.from_dict(data)

    def to_dataframe(self,
                     flatten_metrics: bool = True,
                     flatten_categories: bool = True,
                     add_overall_metric: bool = False) -> pd.DataFrame:
        """
        Convert the report to a pandas DataFrame.

        Args:
            flatten_metrics (bool): Whether to emit rows for every metric; if False, only the first metric is kept.
            flatten_categories (bool): Whether to expand the category name tuple into separate ``Cat.<level>`` columns.
            add_overall_metric (bool): Whether to append an 'OVERALL' row for each metric with more than one subset.

        Returns:
            pd.DataFrame: The report as a pandas DataFrame.
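
        Example (illustrative; the exact rows depend on the report content):

            df = report.to_dataframe(add_overall_metric=True)
            # -> one row per (metric, category, subset), plus an 'OVERALL' row
            #    for each metric with more than one subset; columns are Model,
            #    Dataset, Metric, Subset, Num, Score and one ``Cat.<level>``
            #    column per category nesting level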
""" table = defaultdict(list) for metric in self.metrics: metric_count = 0 for category in metric.categories: for subset in category.subsets: metric_count += 1 table[ReportKey.model_name].append(self.model_name) table[ReportKey.dataset_name].append(self.dataset_name) table[ReportKey.metric_name].append(metric.name) table[ReportKey.category_name].append(category.name) table[ReportKey.subset_name].append(subset.name) table[ReportKey.num].append(subset.num) table[ReportKey.score].append(subset.score) # add overall metric when there are multiple subsets if metric_count > 1 and add_overall_metric: table[ReportKey.model_name].append(self.model_name) table[ReportKey.dataset_name].append(self.dataset_name) table[ReportKey.metric_name].append(metric.name) table[ReportKey.category_name].append(('-', )) table[ReportKey.subset_name].append('OVERALL') table[ReportKey.num].append(metric.num) table[ReportKey.score].append(metric.score) # NOTE: only flatten metrics if needed, use the first metric by default if not flatten_metrics: break df = pd.DataFrame.from_dict(table, orient='columns') if flatten_categories: df = self._flatten_categories(df) return df def _flatten_categories(self, df: pd.DataFrame): # expand categories to multiple rows df_categories = df.copy() # multi-level aggregation for categories max_depth = df_categories[ReportKey.category_name].apply(len).max() for level in range(max_depth): df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply( lambda x: x[level] if len(x) > level else None) df_categories.drop(columns=[ReportKey.category_name], inplace=True) return df_categories def generate_analysis(self, judge_llm_config: dict) -> str: import locale from evalscope.metrics import LLMJudge try: # get the default locale lang, _ = locale.getlocale() if lang is None: language = '中文' else: language = 'en' if lang.startswith('en') else '中文' prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str()) judge_llm = LLMJudge(**judge_llm_config) response = judge_llm(prompt) except Exception as e: logger.error(f'Error generating analysis: {e}') response = 'N/A' self.analysis = response return response