# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import json
import os
from typing import List, Union

from evalscope.config import TaskConfig, parse_task_config
from evalscope.constants import EvalBackend
from evalscope.report import gen_table
from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
from evalscope.utils.logger import get_logger

logger = get_logger()


class Summarizer:
    """Summarize evaluation results into a list of report dicts."""

    @staticmethod
    def get_report(outputs_dir: str) -> List[dict]:
        """Load every report JSON found under the reports directory of an outputs dir."""
        res_list: list = []

        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
        reports_dir: str = outputs_structure.reports_dir
        if reports_dir is None:
            raise ValueError(f'No reports directory in {outputs_dir}')

        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
        for report_file in report_files:
            with open(report_file, 'r') as f:
                res_list.append(json.load(f))

        report_table: str = gen_table(reports_path_list=[reports_dir])
        logger.info(f'*** Report table ***\n{report_table}')

        return res_list

    @staticmethod
    def get_report_from_cfg(task_cfg: Union[str, List[str], TaskConfig, List[TaskConfig], dict]) -> List[dict]:
        """
        Get reports from a task configuration.

        Args:
            task_cfg: A task config (or a list of them), given as a file path, a dict,
                or a TaskConfig object. Refer to evalscope/tasks/eval_qwen-7b-chat_v100.yaml
                for an example.

        Returns:
            list: A list of report dicts. Each report dict is the overall report
                on one benchmark for a specific model.
        """
        final_res_list: List[dict] = []
        candidate_task_cfgs: List[TaskConfig] = []

        if isinstance(task_cfg, list):
            for task_cfg_item in task_cfg:
                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
        else:
            candidate_task_cfgs.append(parse_task_config(task_cfg))

        for candidate_task in candidate_task_cfgs:
            logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
            eval_backend = candidate_task.eval_backend

            if eval_backend == EvalBackend.NATIVE:
                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                if outputs_dir is None:
                    raise ValueError(f'No outputs_dir in {task_cfg}')
                res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                final_res_list.extend(res_list)

            elif eval_backend == EvalBackend.OPEN_COMPASS:
                eval_config = Summarizer.parse_eval_config(candidate_task)

                work_dir = eval_config.get('work_dir') or 'outputs/default'
                if not os.path.exists(work_dir):
                    raise ValueError(f'work_dir {work_dir} does not exist.')

                res_folder_path = get_latest_folder_path(work_dir=work_dir)
                summary_files = glob.glob(os.path.join(res_folder_path, 'summary', '*.csv'))
                if len(summary_files) == 0:
                    raise ValueError(f'No summary files in {res_folder_path}')

                summary_file_path = summary_files[0]
                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}]  # noqa: E501
                summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                final_res_list.extend(summary_res)

            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                eval_config = Summarizer.parse_eval_config(candidate_task)

                work_dir = eval_config.get('work_dir') or 'outputs'
                if not os.path.exists(work_dir):
                    raise ValueError(f'work_dir {work_dir} does not exist.')

                for model in eval_config['model']:
                    if model['name'] == 'CustomAPIModel':
                        model_name = model['type']
                    else:
                        model_name = model['name']

                    csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                    json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))

                    summary_files = csv_files + json_files
                    for summary_file_path in summary_files:
                        if summary_file_path.endswith('csv'):
                            summary_res: dict = csv_to_list(summary_file_path)[0]
                        elif summary_file_path.endswith('json'):
                            summary_res: dict = json_to_dict(summary_file_path)
                        base_name = os.path.basename(summary_file_path)
                        file_name = os.path.splitext(base_name)[0]
                        final_res_list.append({file_name: summary_res})

            elif eval_backend == EvalBackend.THIRD_PARTY:
                raise ValueError('*** The summarizer for the third-party evaluation backend is not supported yet ***')
            else:
                raise ValueError(f'Invalid eval_backend: {eval_backend}')

        return final_res_list

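    # Illustrative only (not taken from a real config): the dict returned by
    # parse_eval_config() below is expected to carry the keys read above, e.g.
    #   {'work_dir': 'outputs/default', ...}                                            # OpenCompass backend
    #   {'work_dir': 'outputs', 'model': [{'name': 'CustomAPIModel', 'type': 'qwen-7b-chat'}]}  # VLMEvalKit backend
    # Any remaining keys are backend-specific and are not interpreted by this class.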
    @staticmethod
    def parse_eval_config(candidate_task: TaskConfig):
        """Resolve the task's eval_config into a dict, loading it from YAML/JSON if a path is given."""
        eval_config: Union[str, dict] = candidate_task.eval_config
        assert eval_config is not None, 'Please provide eval_config for the specific evaluation backend.'

        if isinstance(eval_config, str):
            if eval_config.endswith('.yaml'):
                eval_config: dict = yaml_to_dict(eval_config)
            elif eval_config.endswith('.json'):
                eval_config: dict = json_to_dict(eval_config)
            else:
                raise ValueError(f'Invalid eval_config: {eval_config}')
        return eval_config


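# Minimal sketch of programmatic use: get_report_from_cfg also accepts a dict, a
# TaskConfig, or a list of config paths (see its signature above). The paths below
# are placeholders, not files shipped with this module.
#   report_list = Summarizer.get_report_from_cfg(
#       ['registry/tasks/task_a.yaml', 'registry/tasks/task_b.yaml'])
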
if __name__ == '__main__':
    cfg_file = 'registry/tasks/eval_qwen-7b-chat_v100.yaml'
    report_list = Summarizer.get_report_from_cfg(cfg_file)

    print(report_list)
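
    # Alternative sketch (placeholder path, not shipped with this repo): reports from a
    # finished native run can also be collected straight from its outputs directory:
    #   report_list = Summarizer.get_report(outputs_dir='./outputs/20250101_000000')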