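"""Generate the supported LLM benchmark Markdown documentation (zh/en) from registered DataAdapter instances."""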

import json
from tqdm import tqdm
from typing import Dict
from evalscope.benchmarks import DataAdapter
# Language dictionaries for dataset markdown generation
DATASET_DETAIL_LOCALE = {
'back_to_top': {
'zh': '返回目录',
'en': 'Back to Top'
},
'toc_title': {
'zh': 'LLM评测集',
'en': 'LLM Benchmarks'
},
'dataset_name': {
'zh': '数据集名称',
'en': 'Dataset Name'
},
'dataset_id': {
'zh': '数据集ID',
'en': 'Dataset ID'
},
'description': {
'zh': '数据集描述',
'en': 'Description'
},
'task_categories': {
'zh': '任务类别',
'en': 'Task Categories'
},
'evaluation_metrics': {
'zh': '评估指标',
'en': 'Evaluation Metrics'
},
'requires_llm_judge': {
'zh': '需要LLM Judge',
'en': 'Requires LLM Judge'
},
'default_shots': {
'zh': '默认提示方式',
'en': 'Default Shots'
},
'subsets': {
'zh': '数据集子集',
'en': 'Subsets'
},
'supported_output_formats': {
'zh': '支持输出格式',
'en': 'Supported Output Formats'
},
'extra_parameters': {
'zh': '额外参数',
'en': 'Extra Parameters'
},
'system_prompt': {
'zh': '系统提示词',
'en': 'System Prompt'
},
'prompt_template': {
'zh': '提示模板',
'en': 'Prompt Template'
},
'yes': {
        'zh': '是',
'en': 'Yes'
},
'no': {
        'zh': '否',
'en': 'No'
},
'no_description': {
'zh': '暂无详细描述',
'en': 'No detailed description available'
}
}
DOCUMENT_LOCALE = {
'title': {
'zh': 'LLM评测集',
'en': 'LLM Benchmarks'
},
'intro': {
        'zh': '以下是支持的LLM评测集列表，点击数据集标准名称可跳转详细信息。',
'en': 'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.'
},
'dataset_name': {
'zh': '数据集名称',
'en': 'Benchmark Name'
},
'pretty_name': {
'zh': '标准名称',
'en': 'Pretty Name'
},
'task_categories': {
'zh': '任务类别',
'en': 'Task Categories'
},
'details_title': {
'zh': '数据集详情',
'en': 'Benchmark Details'
}
}
def wrap_key_words(keywords: list[str]) -> str:
"""
将关键词列表转换为Markdown格式的字符串
Args:
keywords (list[str]): 关键词列表
Returns:
str: 格式化的Markdown字符串
"""
# 使用逗号分隔关键词,并添加反引号格式化
return ', '.join(sorted([f'`{keyword}`' for keyword in keywords]))
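
# Illustrative example (not executed): wrap_key_words sorts the backtick-wrapped keywords, e.g.
#   wrap_key_words(['Reasoning', 'Math'])  ->  '`Math`, `Reasoning`'
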
def process_dictionary(data: dict) -> str:
"""
json.dumps的包装函数处理字典格式化为Markdown代码块
Args:
data (dict): 要格式化的字典
"""
return json.dumps(data, ensure_ascii=False, indent=4)
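
# Illustrative example (not executed; the dictionary key is made up for the example):
#   process_dictionary({'max_tokens': 512})  ->  '{\n    "max_tokens": 512\n}'
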
def get_dataset_detail_locale(lang: str) -> Dict[str, str]:
"""Get localized strings for dataset details"""
return {k: v[lang] for k, v in DATASET_DETAIL_LOCALE.items()}
def get_document_locale(lang: str) -> Dict[str, str]:
"""Get localized strings for document structure"""
return {k: v[lang] for k, v in DOCUMENT_LOCALE.items()}
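
# Illustrative example (not executed): the getters flatten the locale tables to one language, e.g.
#   get_dataset_detail_locale('en')['requires_llm_judge']  ->  'Requires LLM Judge'
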
def generate_dataset_markdown(data_adapter: DataAdapter, lang: str = 'zh') -> str:
"""
Generate a well-formatted Markdown benchmark introduction based on a DataAdapter instance
Args:
data_adapter (DataAdapter): Dataset adapter instance
lang (str): Language code ('zh' for Chinese, 'en' for English)
Returns:
str: Formatted Markdown string
"""
# Get localized text
text = get_dataset_detail_locale(lang)
# Get basic information
name = data_adapter.name
pretty_name = data_adapter.pretty_name or name
dataset_id = data_adapter.dataset_id
description = data_adapter.description or text['no_description']
# Format dataset ID links
if dataset_id.startswith(('http://', 'https://')):
dataset_id_md = f'[{dataset_id}]({dataset_id})'
elif '/' in dataset_id: # ModelScope format ID
dataset_id_md = f'[{dataset_id}](https://modelscope.cn/datasets/{dataset_id}/summary)'
else:
dataset_id_md = dataset_id
# Build details section
details = [
f'### {pretty_name}',
'',
f'[{text["back_to_top"]}](#{text["toc_title"].lower().replace(" ", "-")})',
f'- **{text["dataset_name"]}**: `{name}`',
f'- **{text["dataset_id"]}**: {dataset_id_md}',
f'- **{text["description"]}**: \n > {description}',
f'- **{text["task_categories"]}**: {wrap_key_words(data_adapter.tags)}',
f'- **{text["evaluation_metrics"]}**: {wrap_key_words(data_adapter.metric_list)}',
f'- **{text["requires_llm_judge"]}**: {text["yes"] if data_adapter.llm_as_a_judge else text["no"]}',
f'- **{text["default_shots"]}**: {data_adapter.few_shot_num}-shot'
]
# Add dataset subsets
if data_adapter.subset_list:
details.append(f'- **{text["subsets"]}**: {wrap_key_words(data_adapter.subset_list)}')
# Add technical information
technical_info = [
f'- **{text["supported_output_formats"]}**: {wrap_key_words(data_adapter.output_types)}',
]
# Add extra parameters
extra_params = data_adapter.config_kwargs.get('extra_params', {})
if extra_params:
technical_info.append(f'- **{text["extra_parameters"]}**: \n```json\n{process_dictionary(extra_params)}\n```')
# Add prompt templates
if data_adapter.system_prompt:
technical_info.append(f'- **{text["system_prompt"]}**: \n```text\n{data_adapter.system_prompt}\n```')
if data_adapter.prompt_template:
technical_info.append(f'- **{text["prompt_template"]}**: \n```text\n{data_adapter.prompt_template}\n```')
return '\n'.join(details + [''] + technical_info + [''])
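
# Minimal usage sketch (not executed; assumes 'gsm8k' is a key registered in
# BENCHMARK_MAPPINGS, any registered benchmark name works):
#   from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS
#   adapter = BENCHMARK_MAPPINGS['gsm8k'].get_data_adapter()
#   print(generate_dataset_markdown(adapter, lang='en'))
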
def generate_full_documentation(adapters: list[DataAdapter], lang: str = 'zh') -> str:
"""
Generate complete Markdown documentation with index and all benchmark details
Args:
adapters (list[DataAdapter]): List of DataAdapter instances
lang (str): Language code ('zh' for Chinese, 'en' for English)
Returns:
str: Complete Markdown document
"""
# Get localized text
text = get_document_locale(lang)
# Generate index
index = [
f'# {text["title"]}',
'',
f'{text["intro"]}',
'',
f'| {text["dataset_name"]} | {text["pretty_name"]} | {text["task_categories"]} |',
'|------------|----------|----------|',
]
for adapter in adapters:
name = adapter.name
pretty_name = adapter.pretty_name or name
link_name = pretty_name.lower().replace(' ', '-').replace('.', '')
tags = wrap_key_words(adapter.tags)
index.append(f'| `{name}` | [{pretty_name}](#{link_name}) | {tags} |')
# Generate details section
details = [
'',
'---',
'',
f'## {text["details_title"]}',
''
]
for i, adapter in enumerate(adapters):
details.append(generate_dataset_markdown(adapter, lang))
if i < len(adapters) - 1:
details.append('---')
details.append('')
return '\n'.join(index + details)
if __name__ == '__main__':
    # Example usage
from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS
aigc_benchmarks = ['evalmuse', 'genai_bench', 'general_t2i', 'hpdv2', 'tifa160', 'data_collection']
    # Collect DataAdapter instances for all benchmarks except the AIGC ones
adapters = []
for benchmark in tqdm(BENCHMARK_MAPPINGS.values()):
if benchmark.name not in aigc_benchmarks:
adapters.append(benchmark.get_data_adapter())
    adapters.sort(key=lambda x: x.name)  # Sort by benchmark name
    # Generate the full documentation in both languages
markdown_doc = generate_full_documentation(adapters, 'zh')
markdown_doc_en = generate_full_documentation(adapters, 'en')
    # Write the generated documents to files
with open('docs/zh/get_started/supported_dataset/llm.md', 'w', encoding='utf-8') as f:
f.write(markdown_doc)
with open('docs/en/get_started/supported_dataset/llm.md', 'w', encoding='utf-8') as f:
f.write(markdown_doc_en)
print('Done')