evalscope/tests/cli/test_all.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils import test_level_list
from evalscope.utils.logger import get_logger

# Read secrets (e.g. DASHSCOPE_API_KEY) from a local .env file.
env = dotenv_values('.env')

os.environ['LOG_LEVEL'] = 'DEBUG'
logger = get_logger()
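
# Benchmarks covered by this end-to-end smoke test. Each one runs with
# `limit=1` (see the TaskConfig below), so a pass only confirms that the
# dataset adapter loads, prompts are built, and one model response is parsed.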
datasets = [
    'iquiz',
    'ifeval',
    'mmlu',
    'mmlu_pro',
    'musr',
    'process_bench',
    'race',
    'trivia_qa',
    'cmmlu',
    'humaneval',
    'gsm8k',
    'bbh',
    'competition_math',
    'math_500',
    'aime24',
    'gpqa',
    'arc',
    'ceval',
    'hellaswag',
    'general_mcq',
    'general_qa',
    'super_gpqa',
    'live_code_bench',
    'mmlu_redux',
    'simple_qa',
    'chinese_simpleqa',
    'alpaca_eval',
    'arena_hard',
    'maritime_bench',
    'drop',
    'winogrande',
    'tool_bench',
]
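
# Per-dataset overrides: `subset_list` limits each benchmark to a handful of
# subsets and `few_shot_num` pins the number of in-context examples, keeping
# the smoke test fast and deterministic.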
dataset_args = {
    'mmlu': {
        'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
        'few_shot_num': 0
    },
    'mmlu_pro': {
        'subset_list': ['math', 'health'],
        'few_shot_num': 4
    },
    'ceval': {
        'subset_list': ['computer_network', 'operating_system', 'computer_architecture'],
        'few_shot_num': 0
    },
    'cmmlu': {
        'subset_list': ['elementary_chinese'],
        'few_shot_num': 0
    },
    'bbh': {
        'subset_list': ['word_sorting', 'movie_recommendation'],
    },
    'gpqa': {
        'subset_list': ['gpqa_diamond'],
        'few_shot_num': 0,
    },
    'humaneval': {
        'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
    },
    'competition_math': {
        'subset_list': ['Level 1']
    },
    'math_500': {
        'subset_list': ['Level 1']
    },
    'process_bench': {
        'subset_list': ['gsm8k'],
    },
    'musr': {
        'subset_list': ['murder_mysteries']
    },
    'general_mcq': {
        'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
        'subset_list': [
            'example'  # dataset name, i.e. the * in the *_dev.csv files
        ],
    },
    'general_qa': {
        'local_path': 'custom_eval/text/qa',  # path to the custom dataset
        'subset_list': [
            'example',  # dataset name, i.e. the * in the *_dev.csv files
            # 'test'
        ],
        'metric_list': ['AverageBLEU']
    },
    'super_gpqa': {
        'subset_list': ['Philosophy', 'Education'],
        'few_shot_num': 0
    },
    'live_code_bench': {
        'subset_list': ['v4_v5'],
        'extra_params': {
            'start_date': '2024-12-01',
            'end_date': '2025-01-01'
        },
    },
    'chinese_simpleqa': {
        'subset_list': ['中华文化']
    },
    'mmlu_redux': {
        'subset_list': ['abstract_algebra']
    },
}
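
# Single end-to-end case: build one TaskConfig covering every benchmark above
# and submit it through run_task against a DashScope-compatible endpoint.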
class TestRun(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_benchmarks(self):
        task_cfg = TaskConfig(
            model='qwen2.5-0.5b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=datasets,
            dataset_args=dataset_args,
            eval_batch_size=1,
            limit=1,  # one sample per dataset keeps the run cheap
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': 'qwen2.5-7b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
            },
        )
        run_task(task_cfg=task_cfg)
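

# Allow running this file directly, e.g. `python evalscope/tests/cli/test_all.py`.
if __name__ == '__main__':
    unittest.main()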