105 lines
4.2 KiB
Python
105 lines
4.2 KiB
Python
import pandas as pd
|
|
from pandas import DataFrame
|
|
from typing import TYPE_CHECKING
|
|
|
|
from evalscope.constants import DataCollection
|
|
from evalscope.report.utils import *
|
|
|
|
if TYPE_CHECKING:
|
|
from evalscope.benchmarks import DataAdapter
|
|
|
|
|
|
class ReportGenerator:
    """Build ``Report`` objects from raw evaluation scores.

    ``Report``, ``Metric``, ``Category`` and ``Subset`` are not defined in this
    file; they are presumably provided by the ``evalscope.report.utils`` star
    import.
    """

    @staticmethod
    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
        """
        Generate a report for a single dataset from per-subset scores.

        The scores are grouped first by metric name (in order of first
        appearance) and then by category, yielding a
        ``Report -> Metric -> Category -> Subset`` hierarchy. The report is
        named ``'{model_name}@{data_adapter.name}'``.

        Args:
            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
                {
                    'subset_name': [
                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
                    ],
                    ...
                }
            model_name (str): Name of the evaluated model; used in the report name.
            data_adapter (DataAdapter): Supplies ``name``, ``category_map``, ``description``
                and ``pretty_name`` for the dataset.
            **kwargs: Accepted for call-site compatibility; not used by this method.

        Returns:
            Report: A structured report object containing metrics, categories, and subsets.

        Raises:
            KeyError: If ``subset_score_map`` yields no score items at all, because the
                flattened frame then has no ``'metric_name'`` column to group by.

        Example:
            report = ReportGenerator.gen_report(subset_score_map, 'my-model', data_adapter)
        """
        dataset_name = data_adapter.name
        category_map = data_adapter.category_map
        report_name = f'{model_name}@{dataset_name}'

        def flatten_subset() -> DataFrame:
            """
            Flatten subset score map to a DataFrame, one row per (subset, metric).

            Example:
                        name  score  num  metric_name  categories
                0       ARC-Easy    0.5    2  AverageAccuracy  (default,)
                1  ARC-Challenge    0.5    2  AverageAccuracy  (default,)
            """
            rows = []
            for subset_name, scores in subset_score_map.items():
                # The category only depends on the subset, so resolve it once
                # per subset rather than once per score item.
                categories = category_map.get(subset_name, ['default'])
                if isinstance(categories, str):
                    categories = [categories]
                # Stored as a tuple so the column is hashable for groupby.
                category_key = tuple(categories)
                for score_item in scores:
                    rows.append(
                        dict(
                            name=subset_name,
                            score=score_item['score'],
                            num=score_item['num'],
                            metric_name=score_item['metric_name'],
                            categories=category_key))
            return pd.DataFrame(rows)

        df = flatten_subset()

        metrics_list = []
        # sort=False keeps metrics in the order they first appear in the input.
        for metric_name, group_metric in df.groupby('metric_name', sort=False):
            categories = []
            for category_name, group_category in group_metric.groupby('categories'):
                subsets = [
                    Subset(name=row['name'], score=row['score'], num=row['num'])
                    for _, row in group_category.iterrows()
                ]
                categories.append(Category(name=category_name, subsets=subsets))

            metrics_list.append(Metric(name=metric_name, categories=categories))

        report = Report(
            name=report_name,
            metrics=metrics_list,
            dataset_name=dataset_name,
            model_name=model_name,
            dataset_description=data_adapter.description,
            dataset_pretty_name=data_adapter.pretty_name)
        return report

    @staticmethod
    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
        """
        Generate a single-metric report for a collection of datasets.

        Rows of ``df`` are grouped by ``'categories'`` and then by
        ``('dataset_name', 'subset_name')``. Each inner group becomes one
        ``Subset`` named ``'{dataset_name}/{subset_name}'`` whose score is the
        mean of the group's ``'score'`` column and whose ``num`` is the count
        of non-null scores in that group.

        Args:
            df (DataFrame): Frame with at least the columns ``'categories'``,
                ``'dataset_name'``, ``'subset_name'`` and ``'score'``.
            all_dataset_name (str): Value stored as the report's ``dataset_name``.
            model_name (str): Name of the evaluated model.

        Returns:
            Report: A report named ``DataCollection.NAME`` holding one metric
                called ``'Average'``.
        """
        categories = []
        for category_name, group_category in df.groupby('categories'):
            subsets = []
            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
                scores = group_subset['score']
                # Cast numpy scalars to builtin types before handing them to Subset.
                subsets.append(
                    Subset(
                        name=f'{dataset_name}/{subset_name}',
                        score=float(scores.mean()),
                        num=int(scores.count())))

            categories.append(Category(name=category_name, subsets=subsets))

        return Report(
            name=DataCollection.NAME,
            metrics=[Metric(name='Average', categories=categories)],
            dataset_name=all_dataset_name,
            model_name=model_name)
|