evalscope_v0.17.0/evalscope.0.17.0/evalscope/report/utils.py

import json
import os
import pandas as pd
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Tuple, Union
from evalscope.metrics import macro_mean, micro_mean
from evalscope.utils import get_logger

logger = get_logger()

# Prompt sent to the judge model (kept in Chinese; English translation:
# "Given the model evaluation results in JSON format, output an analysis
# report. Requirements: 1. The report has four parts: overall performance,
# key-metric analysis, improvement suggestions, and conclusion. 2. If the
# model has multiple metrics, split them into low-, mid-, and high-score
# groups and present them as markdown tables. 3. Output only the report
# itself, with no extra content. 4. The report language is {language}.")
ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果输出分析报告要求如下
1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
2. 若模型有多种指标将其分为低分、中分、高分三个部分并列出markdown表格
3. 只列出报告本身,不要有其他多余内容
4. 输出报告语言为{language}
```json
{report_str}
```
"""


def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
"""
Normalize score.
Args:
score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
keep_num: number of digits to keep.
Returns:
        Union[float, dict]: normalized score. e.g. 0.1235 or {'acc': 0.1235, 'f1': 0.1235}
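
    Example (illustrative; values follow Python's built-in round()):
        >>> normalize_score(0.12345678)
        0.1235
        >>> normalize_score({'acc': 0.12345678, 'f1': 0.5})
        {'acc': 0.1235, 'f1': 0.5}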
"""
    if isinstance(score, (int, float)):
score = round(score, keep_num)
elif isinstance(score, dict):
score = {k: round(v, keep_num) for k, v in score.items()}
else:
logger.warning(f'Unknown score type: {type(score)}')
return score


@dataclass
class Subset:
name: str = 'default_subset'
score: float = 0.0
num: int = 0

    def __post_init__(self):
self.score = normalize_score(self.score)


@dataclass
class Category:
    name: Tuple[str, ...] = field(default_factory=tuple)
num: int = 0
score: float = 0.0
macro_score: float = 0.0
subsets: List[Subset] = field(default_factory=list)

    def __post_init__(self):
if isinstance(self.name, str):
# ensure name is tuple format
self.name = (self.name, )
self.num = sum(subset.num for subset in self.subsets)
self.score = normalize_score(micro_mean(self.subsets))
self.macro_score = normalize_score(macro_mean(self.subsets))

    @classmethod
def from_dict(cls, data: dict):
subsets = [Subset(**subset) for subset in data.get('subsets', [])]
return cls(name=data['name'], subsets=subsets)
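
# Illustrative sketch (assumes, per evalscope.metrics, that micro_mean weights
# subset scores by `num` while macro_mean averages them unweighted):
#   cat = Category.from_dict({
#       'name': 'math',
#       'subsets': [{'name': 'algebra', 'score': 0.9, 'num': 90},
#                   {'name': 'geometry', 'score': 0.5, 'num': 10}],
#   })
#   cat.score        # 0.86 -- micro: (0.9 * 90 + 0.5 * 10) / 100
#   cat.macro_score  # 0.7  -- macro: (0.9 + 0.5) / 2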


@dataclass
class Metric:
name: str = 'default_metric'
num: int = 0
score: float = 0.0
macro_score: float = 0.0
categories: List[Category] = field(default_factory=list)

    def __post_init__(self):
self.num = sum(category.num for category in self.categories)
self.score = normalize_score(micro_mean(self.categories))
self.macro_score = normalize_score(macro_mean(self.categories))

    @classmethod
def from_dict(cls, data: dict):
categories = [Category.from_dict(category) for category in data.get('categories', [])]
return cls(name=data['name'], categories=categories)
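
# Nesting sketch (illustrative): a Metric aggregates Category objects the same
# way a Category aggregates Subset objects (micro `score` plus an unweighted
# `macro_score`):
#   m = Metric.from_dict({
#       'name': 'AverageAccuracy',
#       'categories': [{'name': 'math',
#                       'subsets': [{'name': 'algebra', 'score': 0.9, 'num': 90}]}],
#   })
#   m.num    # 90
#   m.score  # 0.9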


class ReportKey:
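    """Column names used by Report.to_dataframe below."""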
model_name = 'Model'
dataset_name = 'Dataset'
metric_name = 'Metric'
category_name = 'Category'
category_prefix = 'Cat.'
subset_name = 'Subset'
num = 'Num'
score = 'Score'


@dataclass
class Report:
name: str = 'default_report'
dataset_name: str = 'default_dataset'
dataset_pretty_name: str = ''
dataset_description: str = ''
model_name: str = 'default_model'
score: float = 0.0
metrics: List[Metric] = field(default_factory=list)
analysis: str = 'N/A'

    def __post_init__(self):
        # NOTE: only use the first metric by default; guard against an empty metric list
        if self.metrics:
            self.score = self.metrics[0].score

    def to_dict(self) -> Dict[str, Any]:
return asdict(self)

    def to_json_str(self) -> str:
return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)

    def to_json(self, json_file: str):
        # ensure the directory exists (dirname is empty for a bare filename)
        dir_name = os.path.dirname(json_file)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
# write the report to a json file
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)

    @classmethod
def from_dict(cls, data: dict):
metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
return cls(
name=data['name'],
dataset_name=data['dataset_name'],
            dataset_pretty_name=data.get('dataset_pretty_name', ''),
            dataset_description=data.get('dataset_description', ''),
score=data['score'],
model_name=data['model_name'],
metrics=metrics,
analysis=data.get('analysis', 'N/A'),
)

    @classmethod
def from_json(cls, json_file: str):
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
return cls.from_dict(data)

    def to_dataframe(self,
flatten_metrics: bool = True,
flatten_categories: bool = True,
add_overall_metric: bool = False) -> pd.DataFrame:
"""
Convert the report to a pandas DataFrame.
Args:
flatten_metrics (bool): Whether to flatten the metrics to a single row.
flatten_categories (bool): Whether to flatten the categories to multiple rows.
add_overall_metric (bool): Whether to add an overall metric row.
Returns:
pd.DataFrame: The report as a pandas DataFrame.
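
        Example (illustrative row layout with flatten_categories=True; each
        category-name tuple is expanded into Cat.0, Cat.1, ... columns):
            Model | Dataset | Metric | Subset | Num | Score | Cat.0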
"""
table = defaultdict(list)
for metric in self.metrics:
metric_count = 0
for category in metric.categories:
for subset in category.subsets:
metric_count += 1
table[ReportKey.model_name].append(self.model_name)
table[ReportKey.dataset_name].append(self.dataset_name)
table[ReportKey.metric_name].append(metric.name)
table[ReportKey.category_name].append(category.name)
table[ReportKey.subset_name].append(subset.name)
table[ReportKey.num].append(subset.num)
table[ReportKey.score].append(subset.score)
# add overall metric when there are multiple subsets
if metric_count > 1 and add_overall_metric:
table[ReportKey.model_name].append(self.model_name)
table[ReportKey.dataset_name].append(self.dataset_name)
table[ReportKey.metric_name].append(metric.name)
table[ReportKey.category_name].append(('-', ))
table[ReportKey.subset_name].append('OVERALL')
table[ReportKey.num].append(metric.num)
table[ReportKey.score].append(metric.score)
            # NOTE: when flatten_metrics is False, keep only the first metric
            if not flatten_metrics:
                break
df = pd.DataFrame.from_dict(table, orient='columns')
if flatten_categories:
df = self._flatten_categories(df)
return df

    def _flatten_categories(self, df: pd.DataFrame):
# expand categories to multiple rows
df_categories = df.copy()
# multi-level aggregation for categories
max_depth = df_categories[ReportKey.category_name].apply(len).max()
for level in range(max_depth):
df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
lambda x: x[level] if len(x) > level else None)
df_categories.drop(columns=[ReportKey.category_name], inplace=True)
return df_categories
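
    # Flattening sketch (illustrative): a category name ('reasoning', 'math')
    # becomes Cat.0 = 'reasoning' and Cat.1 = 'math'; name tuples shorter than
    # the deepest one are padded with None at the missing levels.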

    def generate_analysis(self, judge_llm_config: dict) -> str:
import locale
from evalscope.metrics import LLMJudge
try:
# get the default locale
lang, _ = locale.getlocale()
            if lang is None:
                # no locale configured; default to Chinese
                language = '中文'
            else:
                language = 'en' if lang.startswith('en') else '中文'  # '中文' == Chinese
prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
judge_llm = LLMJudge(**judge_llm_config)
response = judge_llm(prompt)
except Exception as e:
logger.error(f'Error generating analysis: {e}')
response = 'N/A'
self.analysis = response
return response
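

if __name__ == '__main__':
    # Minimal smoke test (illustrative data only, not a real evaluation result).
    demo = Report.from_dict({
        'name': 'demo_report',
        'dataset_name': 'demo_dataset',
        'model_name': 'demo_model',
        'score': 0.0,  # recomputed from the first metric in __post_init__
        'metrics': [{
            'name': 'AverageAccuracy',
            'categories': [{
                'name': 'default',
                'subsets': [{'name': 'default_subset', 'score': 0.75, 'num': 100}],
            }],
        }],
    })
    print(demo.to_json_str())
    print(demo.to_dataframe())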