272 lines
8.3 KiB
Python
272 lines
8.3 KiB
Python
import json
|
||
from tqdm import tqdm
|
||
from typing import Any, Dict
|
||
|
||
from evalscope.benchmarks import DataAdapter
|
||
|
||
# Localized labels for the per-dataset detail section.
# Layout: label key -> {language code -> display text}; every entry provides 'zh' and 'en'.
# 'toc_title' is what generate_dataset_markdown turns into the "back to top" anchor.
DATASET_DETAIL_LOCALE = {
    'back_to_top': {'zh': '返回目录', 'en': 'Back to Top'},
    'toc_title': {'zh': 'LLM评测集', 'en': 'LLM Benchmarks'},
    'dataset_name': {'zh': '数据集名称', 'en': 'Dataset Name'},
    'dataset_id': {'zh': '数据集ID', 'en': 'Dataset ID'},
    'description': {'zh': '数据集描述', 'en': 'Description'},
    'task_categories': {'zh': '任务类别', 'en': 'Task Categories'},
    'evaluation_metrics': {'zh': '评估指标', 'en': 'Evaluation Metrics'},
    'requires_llm_judge': {'zh': '需要LLM Judge', 'en': 'Requires LLM Judge'},
    'default_shots': {'zh': '默认提示方式', 'en': 'Default Shots'},
    'subsets': {'zh': '数据集子集', 'en': 'Subsets'},
    'supported_output_formats': {'zh': '支持输出格式', 'en': 'Supported Output Formats'},
    'extra_parameters': {'zh': '额外参数', 'en': 'Extra Parameters'},
    'system_prompt': {'zh': '系统提示词', 'en': 'System Prompt'},
    'prompt_template': {'zh': '提示模板', 'en': 'Prompt Template'},
    'yes': {'zh': '是', 'en': 'Yes'},
    'no': {'zh': '否', 'en': 'No'},
    'no_description': {'zh': '暂无详细描述', 'en': 'No detailed description available'},
}
|
||
|
||
# Localized labels for the document skeleton (title, intro, index table header, details heading).
# Layout: label key -> {language code -> display text}; every entry provides 'zh' and 'en'.
DOCUMENT_LOCALE = {
    'title': {'zh': 'LLM评测集', 'en': 'LLM Benchmarks'},
    'intro': {
        'zh': '以下是支持的LLM评测集列表,点击数据集标准名称可跳转详细信息。',
        'en': 'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.',
    },
    'dataset_name': {'zh': '数据集名称', 'en': 'Benchmark Name'},
    'pretty_name': {'zh': '标准名称', 'en': 'Pretty Name'},
    'task_categories': {'zh': '任务类别', 'en': 'Task Categories'},
    'details_title': {'zh': '数据集详情', 'en': 'Benchmark Details'},
}
|
||
|
||
def wrap_key_words(keywords: list[str]) -> str:
    """
    Render keywords as a comma-separated list of Markdown inline-code spans.

    Args:
        keywords (list[str]): Keywords to render. ``None`` and empty input
            produce an empty string; a single bare string is treated as one
            keyword rather than being split into characters.

    Returns:
        str: The keywords wrapped in backticks, sorted and joined with ', ',
            e.g. ['b', 'a'] gives '`a`, `b`'.
    """
    if not keywords:
        return ''
    if isinstance(keywords, str):
        # Iterating a str would emit one entry per character.
        keywords = [keywords]

    # Sort the wrapped strings (not the raw keywords) so the ordering of
    # previously generated documents is unchanged.
    return ', '.join(sorted(f'`{keyword}`' for keyword in keywords))
|
||
|
||
def process_dictionary(data: dict) -> str:
    """
    Serialize a dictionary to pretty-printed JSON text.

    Thin wrapper around ``json.dumps``: non-ASCII characters are emitted as-is
    and nesting is indented by four spaces. Only the JSON text is returned;
    no Markdown code fence is added here.

    Args:
        data (dict): Dictionary to serialize.

    Returns:
        str: The JSON representation of ``data``.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    return json.dumps(data, **dump_options)
|
||
|
||
def get_dataset_detail_locale(lang: str) -> Dict[str, str]:
    """
    Return the dataset-detail labels for one language.

    Args:
        lang (str): Language code used as the inner key of DATASET_DETAIL_LOCALE.

    Returns:
        Dict[str, str]: Mapping of label key to its text in ``lang``.

    Raises:
        KeyError: If an entry has no text for ``lang``.
    """
    localized = {}
    for label_key, translations in DATASET_DETAIL_LOCALE.items():
        localized[label_key] = translations[lang]
    return localized
|
||
|
||
def get_document_locale(lang: str) -> Dict[str, str]:
    """
    Return the document-structure labels for one language.

    Args:
        lang (str): Language code used as the inner key of DOCUMENT_LOCALE.

    Returns:
        Dict[str, str]: Mapping of label key to its text in ``lang``.

    Raises:
        KeyError: If an entry has no text for ``lang``.
    """
    localized = {}
    for label_key, translations in DOCUMENT_LOCALE.items():
        localized[label_key] = translations[lang]
    return localized
|
||
|
||
def generate_dataset_markdown(data_adapter: DataAdapter, lang: str = 'zh') -> str:
    """
    Build the Markdown detail section for a single benchmark.

    The section consists of a level-3 heading, a "back to top" link, a bullet
    list of general properties, a blank line, and a bullet list of technical
    properties (output formats plus, when present, extra parameters, system
    prompt and prompt template rendered as fenced code blocks).

    Args:
        data_adapter (DataAdapter): Adapter whose attributes are rendered.
        lang (str): Language code ('zh' for Chinese, 'en' for English).

    Returns:
        str: The Markdown text, ending with a newline.
    """
    labels = get_dataset_detail_locale(lang)

    # Basic information; the pretty name and description have fallbacks.
    name = data_adapter.name
    pretty_name = data_adapter.pretty_name or name
    dataset_id = data_adapter.dataset_id
    description = data_adapter.description or labels['no_description']

    # Dataset ID: full URLs link to themselves, IDs containing '/' link to
    # their ModelScope dataset page, anything else stays plain text.
    dataset_id_md = dataset_id
    if dataset_id.startswith(('http://', 'https://')):
        dataset_id_md = f'[{dataset_id}]({dataset_id})'
    elif '/' in dataset_id:
        dataset_id_md = f'[{dataset_id}](https://modelscope.cn/datasets/{dataset_id}/summary)'

    def bullet(label_key: str, value) -> str:
        """Format one '- **label**: value' list item."""
        return f'- **{labels[label_key]}**: {value}'

    # The anchor is derived from the table-of-contents title.
    toc_anchor = labels['toc_title'].lower().replace(' ', '-')

    lines = [f'### {pretty_name}', '']
    lines.append(f'[{labels["back_to_top"]}](#{toc_anchor})')
    lines.append(bullet('dataset_name', f'`{name}`'))
    lines.append(bullet('dataset_id', dataset_id_md))
    lines.append(bullet('description', f'\n > {description}'))
    lines.append(bullet('task_categories', wrap_key_words(data_adapter.tags)))
    lines.append(bullet('evaluation_metrics', wrap_key_words(data_adapter.metric_list)))
    lines.append(bullet('requires_llm_judge', labels['yes'] if data_adapter.llm_as_a_judge else labels['no']))
    lines.append(bullet('default_shots', f'{data_adapter.few_shot_num}-shot'))

    # Subsets are listed only when the adapter declares any.
    if data_adapter.subset_list:
        lines.append(bullet('subsets', wrap_key_words(data_adapter.subset_list)))

    # A blank line separates the general list from the technical one.
    lines.append('')
    lines.append(bullet('supported_output_formats', wrap_key_words(data_adapter.output_types)))

    extra_params = data_adapter.config_kwargs.get('extra_params', {})
    if extra_params:
        lines.append(bullet('extra_parameters', f'\n```json\n{process_dictionary(extra_params)}\n```'))

    system_prompt = data_adapter.system_prompt
    if system_prompt:
        lines.append(bullet('system_prompt', f'\n```text\n{system_prompt}\n```'))

    prompt_template = data_adapter.prompt_template
    if prompt_template:
        lines.append(bullet('prompt_template', f'\n```text\n{prompt_template}\n```'))

    # The trailing empty entry makes the joined text end with a newline.
    lines.append('')
    return '\n'.join(lines)
|
||
|
||
def generate_full_documentation(adapters: list[DataAdapter], lang: str = 'zh') -> str:
    """
    Assemble the complete Markdown document: index table followed by details.

    The document starts with a title, an intro line and a table with one row
    per adapter linking to its detail section. After a horizontal rule and a
    details heading, each adapter's section (from generate_dataset_markdown)
    follows, with a horizontal rule between consecutive sections.

    Args:
        adapters (list[DataAdapter]): Adapters to document, in output order.
        lang (str): Language code ('zh' for Chinese, 'en' for English).

    Returns:
        str: The complete Markdown document.
    """
    labels = get_document_locale(lang)

    # Title, intro and the index table header.
    doc_lines = [
        f'# {labels["title"]}',
        '',
        f'{labels["intro"]}',
        '',
        f'| {labels["dataset_name"]} | {labels["pretty_name"]} | {labels["task_categories"]} |',
        '|------------|----------|----------|',
    ]

    # One index row per adapter; the anchor is the lower-cased display name
    # with spaces turned into hyphens and dots removed.
    for adapter in adapters:
        name = adapter.name
        display_name = adapter.pretty_name or name
        anchor = display_name.lower().replace(' ', '-').replace('.', '')
        categories = wrap_key_words(adapter.tags)
        doc_lines.append(f'| `{name}` | [{display_name}](#{anchor}) | {categories} |')

    # Rule and heading that open the details part.
    doc_lines.extend(['', '---', '', f'## {labels["details_title"]}', ''])

    # Detail sections, separated by a rule everywhere except after the last.
    last_position = len(adapters) - 1
    for position, adapter in enumerate(adapters):
        doc_lines.append(generate_dataset_markdown(adapter, lang))
        if position != last_position:
            doc_lines.extend(['---', ''])

    return '\n'.join(doc_lines)
|
||
|
||
if __name__ == '__main__':
    # Example usage: regenerate the Chinese and English benchmark pages.
    from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS

    # Benchmark names that are left out of the generated pages.
    aigc_benchmarks = ['evalmuse', 'genai_bench', 'general_t2i', 'hpdv2', 'tifa160', 'data_collection']

    # Collect a DataAdapter for every remaining benchmark, ordered by name.
    adapters = [
        benchmark.get_data_adapter()
        for benchmark in tqdm(BENCHMARK_MAPPINGS.values())
        if benchmark.name not in aigc_benchmarks
    ]
    adapters = sorted(adapters, key=lambda adapter: adapter.name)

    # Render both documents before any file is opened for writing.
    rendered_docs = [
        ('docs/zh/get_started/supported_dataset/llm.md', generate_full_documentation(adapters, 'zh')),
        ('docs/en/get_started/supported_dataset/llm.md', generate_full_documentation(adapters, 'en')),
    ]

    # Write each document to its output file.
    for output_path, markdown_doc in rendered_docs:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_doc)

    print('Done')