evalscope_v0.17.0/evalscope.0.17.0/evalscope/summarizer.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import json
import os
from typing import List, Union
from evalscope.config import TaskConfig, parse_task_config
from evalscope.constants import EvalBackend
from evalscope.report import gen_table
from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
from evalscope.utils.logger import get_logger

logger = get_logger()


class Summarizer:

    @staticmethod
    def get_report(outputs_dir: str) -> List[dict]:
        """Load all report JSON files from an existing outputs directory and log a summary table."""
        res_list: list = []

        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
        reports_dir: str = outputs_structure.reports_dir
        if reports_dir is None:
            raise ValueError(f'No reports directory in {outputs_dir}')

        # Collect report JSON files one level below the reports directory
        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
        for report_file in report_files:
            with open(report_file, 'r') as f:
                res_list.append(json.load(f))

        # Render a human-readable table of all reports for logging
        report_table: str = gen_table(reports_path_list=[reports_dir])
        logger.info(f'*** Report table ***\n{report_table}')

        return res_list
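
    # Usage sketch (illustrative, not part of the library): summarize an already
    # finished native run directly from its outputs folder. The path below is a
    # hypothetical example.
    #
    #   reports = Summarizer.get_report(outputs_dir='./outputs/20240601_120000')
    #   for report in reports:
    #       print(report)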

    @staticmethod
    def get_report_from_cfg(task_cfg: Union[str, List[str], TaskConfig, List[TaskConfig], dict]) -> List[dict]:
        """
        Get report(s) from a task config.

        Args:
            task_cfg: task config, given as a file path, a dict, a TaskConfig, or a list of these.
                For a file example, refer to evalscope/tasks/eval_qwen-7b-chat_v100.yaml.

        Returns:
            list: list of report dicts.
            A report dict is an overall report on one benchmark for a specific model.
        """
        final_res_list: List[dict] = []

        # Normalize the input into a list of TaskConfig objects
        candidate_task_cfgs: List[TaskConfig] = []
        if isinstance(task_cfg, list):
            for task_cfg_item in task_cfg:
                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
        else:
            candidate_task_cfgs.append(parse_task_config(task_cfg))

        for candidate_task in candidate_task_cfgs:
            logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
            eval_backend = candidate_task.eval_backend

            if eval_backend == EvalBackend.NATIVE:
                # Native backend: reports live under the task's work_dir
                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                if outputs_dir is None:
                    raise ValueError(f'No outputs_dir in {task_cfg}')
                res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                final_res_list.extend(res_list)

            elif eval_backend == EvalBackend.OPEN_COMPASS:
                eval_config = Summarizer.parse_eval_config(candidate_task)
                work_dir = eval_config.get('work_dir') or 'outputs/default'
                if not os.path.exists(work_dir):
                    raise ValueError(f'work_dir {work_dir} does not exist.')

                # OpenCompass stores each run in a timestamped folder; read the latest run's summary CSV
                res_folder_path = get_latest_folder_path(work_dir=work_dir)
                summary_files = glob.glob(os.path.join(res_folder_path, 'summary', '*.csv'))
                if len(summary_files) == 0:
                    raise ValueError(f'No summary files in {res_folder_path}')
                summary_file_path = summary_files[0]

                # Example row: {'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}  # noqa: E501
                summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                final_res_list.extend(summary_res)

            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                eval_config = Summarizer.parse_eval_config(candidate_task)
                work_dir = eval_config.get('work_dir') or 'outputs'
                if not os.path.exists(work_dir):
                    raise ValueError(f'work_dir {work_dir} does not exist.')

                # VLMEvalKit writes result files under <work_dir>/<model_name>/
                for model in eval_config['model']:
                    if model['name'] == 'CustomAPIModel':
                        model_name = model['type']
                    else:
                        model_name = model['name']

                    csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                    json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
                    summary_files = csv_files + json_files

                    for summary_file_path in summary_files:
                        if summary_file_path.endswith('csv'):
                            summary_res: dict = csv_to_list(summary_file_path)[0]
                        elif summary_file_path.endswith('json'):
                            summary_res: dict = json_to_dict(summary_file_path)
                        base_name = os.path.basename(summary_file_path)
                        file_name = os.path.splitext(base_name)[0]
                        final_res_list.append({file_name: summary_res})

            elif eval_backend == EvalBackend.THIRD_PARTY:
                raise ValueError('*** The summarizer for Third party evaluation backend is not supported yet ***')

            else:
                raise ValueError(f'Invalid eval_backend: {eval_backend}')

        return final_res_list
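
    # Usage sketch (illustrative): the same entry point also accepts an in-memory
    # TaskConfig for a native run; the field values below are assumptions, not
    # taken from this repository.
    #
    #   task = TaskConfig(model='qwen2.5-7b-instruct', datasets=['gsm8k'], work_dir='./outputs')
    #   reports = Summarizer.get_report_from_cfg(task)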

    @staticmethod
    def parse_eval_config(candidate_task: TaskConfig):
        """Resolve eval_config into a dict, loading it from a YAML or JSON file path if needed."""
        eval_config: Union[str, dict] = candidate_task.eval_config
        assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'

        if isinstance(eval_config, str):
            if eval_config.endswith('.yaml'):
                eval_config: dict = yaml_to_dict(eval_config)
            elif eval_config.endswith('.json'):
                eval_config: dict = json_to_dict(eval_config)
            else:
                raise ValueError(f'Invalid eval_config: {eval_config}')
        return eval_config
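
    # Shape consumed by the VLM_EVAL_KIT branch above (a sketch; values are
    # illustrative): 'work_dir' and the per-model 'name'/'type' keys are the
    # only fields this class actually reads.
    #
    #   eval_config = {
    #       'work_dir': 'outputs',
    #       'model': [
    #           {'name': 'CustomAPIModel', 'type': 'qwen-vl-chat'},
    #       ],
    #   }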


if __name__ == '__main__':
    cfg_file = 'registry/tasks/eval_qwen-7b-chat_v100.yaml'
    report_list = Summarizer.get_report_from_cfg(cfg_file)
    print(report_list)
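
    # A list of task config paths is also accepted, per the method signature
    # (paths below are illustrative):
    #   report_list = Summarizer.get_report_from_cfg(
    #       ['registry/tasks/task_a.yaml', 'registry/tasks/task_b.yaml'])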