def generate_collection():
    """Build the Qwen3 evaluation collection schema and dump the mixed samples to a JSONL file."""
    from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
    from evalscope.utils.io_utils import dump_jsonl_data

    schema = CollectionSchema(name='Qwen3', datasets=[
        CollectionSchema(name='English', datasets=[
            DatasetInfo(name='mmlu_pro', weight=1, task_type='exam', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='mmlu_redux', weight=1, task_type='exam', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='ifeval', weight=1, task_type='instruction', tags=['en'], args={'few_shot_num': 0}),
        ]),
        CollectionSchema(name='Chinese', datasets=[
            DatasetInfo(name='ceval', weight=1, task_type='exam', tags=['zh'], args={'few_shot_num': 0}),
            DatasetInfo(name='iquiz', weight=1, task_type='exam', tags=['zh'], args={'few_shot_num': 0}),
        ]),
        CollectionSchema(name='Code', datasets=[
            DatasetInfo(name='live_code_bench', weight=1, task_type='code', tags=['en'],
                        args={'few_shot_num': 0, 'subset_list': ['v5_v6'],
                              'extra_params': {'start_date': '2025-01-01', 'end_date': '2025-04-30'}}),
        ]),
        CollectionSchema(name='Math&Science', datasets=[
            DatasetInfo(name='math_500', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='aime24', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='aime25', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='gpqa', weight=1, task_type='knowledge', tags=['en'],
                        args={'subset_list': ['gpqa_diamond'], 'few_shot_num': 0}),
        ]),
    ])

    # Get the mixed data; sample a large number to ensure every dataset is fully covered
    mixed_data = WeightedSampler(schema).sample(100000000)

    # Dump the mixed data to a JSONL file
    dump_jsonl_data(mixed_data, 'outputs/qwen3_test.jsonl')


def run_test_think():
    """Evaluate Qwen3-32B on the mixed collection with thinking mode enabled."""
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen3-32B',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'data_collection',
        ],
        dataset_args={
            'data_collection': {
                'dataset_id': 'modelscope/EvalScope-Qwen3-Test',
                'filters': {'remove_until': '</think>'}  # strip the thinking content before scoring
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 30000,  # max tokens to generate; set high to avoid truncated output
            'temperature': 0.6,   # sampling temperature (recommended in the Qwen report)
            'top_p': 0.95,        # top-p sampling (recommended in the Qwen report)
            'top_k': 20,          # top-k sampling (recommended in the Qwen report)
            'n': 1,               # number of replies per request
        },
        timeout=60000,  # request timeout
        stream=True,    # use streaming output
        limit=100,      # evaluate 100 samples for a quick test
    )
    run_task(task_cfg=task_cfg)


def run_test_no_think():
    """Evaluate Qwen3-32B on the mixed collection with thinking mode disabled."""
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen3-32B-no-think',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'data_collection',
        ],
        dataset_args={
            'data_collection': {
                'dataset_id': 'modelscope/EvalScope-Qwen3-Test',
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 10000,  # max tokens to generate; set high to avoid truncated output
            'temperature': 0.7,   # sampling temperature (recommended in the Qwen report)
            'top_p': 0.8,         # top-p sampling (recommended in the Qwen report)
            'top_k': 20,          # top-k sampling (recommended in the Qwen report)
            'n': 1,               # number of replies per request
            'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
        },
        judge_worker_num=1,
        timeout=60000,  # request timeout
        stream=True,    # use streaming output
        limit=10,       # evaluate 10 samples for a quick test
    )
    run_task(task_cfg=task_cfg)


def run_math_thinking():
    """Evaluate Qwen3-32B on MATH-500 with thinking mode enabled."""
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen3-32B',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'math_500',
        ],
        dataset_args={
            'math_500': {
                'filters': {'remove_until': '</think>'}  # strip the thinking content before scoring
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 30000,  # max tokens to generate; set high to avoid truncated output
            'temperature': 0.6,   # sampling temperature (recommended in the Qwen report)
            'top_p': 0.95,        # top-p sampling (recommended in the Qwen report)
            'top_k': 20,          # top-k sampling (recommended in the Qwen report)
            'n': 1,               # number of replies per request
        },
        timeout=60000,  # request timeout
        stream=True,    # use streaming output
        # use_cache='outputs/20250427_234222',
    )
    run_task(task_cfg=task_cfg)


if __name__ == '__main__':
    # generate_collection()
    # run_test_think()
    # run_math_thinking()
    run_test_no_think()