# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task
from evalscope.utils.logger import get_logger
from tests.utils import test_level_list

env = dotenv_values('.env')

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

logger = get_logger()


class TestRunCustom(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_custom_task(self):
        """Evaluate a local checkpoint on custom MCQ and QA datasets."""
        task_cfg = TaskConfig(
            model='Qwen/Qwen3-0.6B',
            datasets=['general_mcq', 'general_qa'],
            dataset_args={
                'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
                    'subset_list': [
                        'example'  # subset name, i.e. the * in the *_dev.csv files
                    ],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # prompt template
                },
                'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
                    'subset_list': [
                        'example'  # subset name, i.e. the * in the *_dev.csv files
                    ]
                }
            },
        )

        res = run_task(task_cfg=task_cfg)
        print(res)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_local_dataset(self):
        """Evaluate an API model against locally stored benchmark data."""
        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                # 'mmlu',
                # 'race',
                'trivia_qa',
                # 'cmmlu',
                # 'humaneval',
                # 'gsm8k',
                # 'bbh',
                # 'competition_math',
                # 'arc',
                # 'ceval',
            ],
            dataset_args={
                'mmlu': {
                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                    'few_shot_num': 0,
                    'dataset_id': 'data/data/mmlu',
                },
                'ceval': {
                    'subset_list': ['computer_network', 'operating_system', 'computer_architecture'],
                    'few_shot_num': 0,
                    'dataset_id': 'data/data/ceval',
                },
                'cmmlu': {
                    'subset_list': ['elementary_chinese'],
                    'dataset_id': 'data/data/cmmlu',
                    'few_shot_num': 0
                },
                'bbh': {
                    'subset_list': ['word_sorting', 'movie_recommendation'],
                },
                'humaneval': {
                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
                },
                'trivia_qa': {
                    'dataset_id': 'data/data/trivia_qa',
                },
            },
            eval_batch_size=10,
            limit=5,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
        )

        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_no_answer(self):
        """Score free-form answers with an LLM judge when no reference answer exists."""
        task_cfg = TaskConfig(
            model='qwen2.5-72b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                'general_qa',
            ],
            dataset_args={
                'general_qa': {
                    'dataset_id': 'custom_eval/text/qa',
                    'subset_list': ['arena', 'example'],
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096
                },
                'score_type': 'numeric',
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.AUTO,
        )

        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_with_answer(self):
        """Judge generated answers against reference answers with an LLM judge."""
        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                'general_qa',
            ],
            dataset_args={
                'general_qa': {
                    'dataset_id': 'custom_eval/text/qa',
                    'subset_list': ['example'],
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096
                },
                'score_type': 'pattern',
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.LLM,
        )

        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_arena(self):
        """Run pairwise arena comparisons over previously generated model reports."""
        task_cfg = TaskConfig(
            model_id='Arena',
            datasets=[
                'general_arena',
            ],
            dataset_args={
                'general_arena': {
                    'extra_params': {
                        'models': [
                            {
                                'name': 'qwen2.5-0.5b',
                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
                            },
                            {
                                'name': 'qwen2.5-7b',
                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
                            },
                            {
                                'name': 'qwen2.5-72b',
                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
                            }
                        ],
                        'baseline': 'qwen2.5-7b'
                    }
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen-plus',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 8000
                },
            },
            judge_worker_num=5,
            use_cache='outputs/20250702_165727'
        )

        run_task(task_cfg=task_cfg)
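

# An assumed convenience entry point (not present in the original source) so the
# module can also be executed directly, e.g. `python test_run_custom.py`.
if __name__ == '__main__':
    unittest.main()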