# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils import test_level_list
from evalscope.utils.logger import get_logger

# Read secrets from a local .env file so credentials stay out of the source.
env = dotenv_values('.env')

os.environ['LOG_LEVEL'] = 'DEBUG'

logger = get_logger()
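
# The task below authenticates with env['DASHSCOPE_API_KEY']; a minimal .env
# next to this script might look like (placeholder value, not a real key):
#
#   DASHSCOPE_API_KEY=sk-your-key-here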

# Benchmarks to exercise; each entry is an evalscope dataset name.
datasets = [
    'iquiz',
    'ifeval',
    'mmlu',
    'mmlu_pro',
    'musr',
    'process_bench',
    'race',
    'trivia_qa',
    'cmmlu',
    'humaneval',
    'gsm8k',
    'bbh',
    'competition_math',
    'math_500',
    'aime24',
    'gpqa',
    'arc',
    'ceval',
    'hellaswag',
    'general_mcq',
    'general_qa',
    'super_gpqa',
    'live_code_bench',
    'mmlu_redux',
    'simple_qa',
    'chinese_simpleqa',
    'alpaca_eval',
    'arena_hard',
    'maritime_bench',
    'drop',
    'winogrande',
    'tool_bench',
]

# Per-dataset overrides that shrink each benchmark (subsets, few-shot counts,
# metrics) so the whole sweep stays cheap. Datasets without an entry here run
# with their defaults.
dataset_args = {
    'mmlu': {
        'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
        'few_shot_num': 0
    },
    'mmlu_pro': {
        'subset_list': ['math', 'health'],
        'few_shot_num': 4
    },
    'ceval': {
        'subset_list': [
            'computer_network', 'operating_system', 'computer_architecture'
        ],
        'few_shot_num': 0
    },
    'cmmlu': {
        'subset_list': ['elementary_chinese'],
        'few_shot_num': 0
    },
    'bbh': {
        'subset_list': ['word_sorting', 'movie_recommendation'],
    },
    'gpqa': {
        'subset_list': ['gpqa_diamond'],
        'few_shot_num': 0,
    },
    'humaneval': {
        'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
    },
    'competition_math': {
        'subset_list': ['Level 1']
    },
    'math_500': {
        'subset_list': ['Level 1']
    },
    'process_bench': {
        'subset_list': ['gsm8k'],
    },
    'musr': {
        'subset_list': ['murder_mysteries']
    },
    'general_mcq': {
        'local_path': 'custom_eval/text/mcq',  # Path to the custom dataset
        'subset_list': [
            'example'  # Dataset name: the * part of the *_dev.csv files above
        ],
    },
    'general_qa': {
        'local_path': 'custom_eval/text/qa',  # Path to the custom dataset
        'subset_list': [
            'example',  # Dataset name: the * part of the *_dev.csv files above
            # 'test'
        ],
        'metric_list': ['AverageBLEU']
    },
    'super_gpqa': {
        'subset_list': ['Philosophy', 'Education'],
        'few_shot_num': 0
    },
    'live_code_bench': {
        'subset_list': ['v4_v5'],
        'extra_params': {
            'start_date': '2024-12-01',
            'end_date': '2025-01-01'
        },
    },
    'chinese_simpleqa': {
        'subset_list': ['中华文化']
    },
    'mmlu_redux': {
        'subset_list': ['abstract_algebra']
    },
}
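
# NOTE: 'general_mcq' and 'general_qa' point at local custom data; the files
# under their `local_path` (e.g. custom_eval/text/mcq/example_dev.csv) must
# exist for those two benchmarks to load.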

class TestRun(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_benchmarks(self):
        # Evaluate a served model through an OpenAI-compatible endpoint;
        # limit=1 keeps the smoke test to one sample per dataset.
        task_cfg = TaskConfig(
            model='qwen2.5-0.5b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=datasets,
            dataset_args=dataset_args,
            eval_batch_size=1,
            limit=1,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            # AUTO engages the judge model only for datasets that need
            # LLM-based scoring.
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-7b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
            },
        )

        run_task(task_cfg=task_cfg)
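
# Optional entry point: lets the file be executed directly; the same test also
# runs under any standard unittest runner.
if __name__ == '__main__':
    unittest.main()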