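"""Evaluate Qwen3-32B with EvalScope: build a mixed benchmark collection
and evaluate it in thinking mode, non-thinking mode, and on math_500 alone."""
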
def generate_collection():
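    """Define the 'Qwen3' evaluation collection (English, Chinese, Code, Math&Science),
    sample every item with WeightedSampler, and dump the mix to outputs/qwen3_test.jsonl."""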
    from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
    from evalscope.utils.io_utils import dump_jsonl_data

    schema = CollectionSchema(name='Qwen3', datasets=[
        CollectionSchema(name='English', datasets=[
            DatasetInfo(name='mmlu_pro', weight=1, task_type='exam', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='mmlu_redux', weight=1, task_type='exam', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='ifeval', weight=1, task_type='instruction', tags=['en'], args={'few_shot_num': 0}),
        ]),
        CollectionSchema(name='Chinese', datasets=[
            DatasetInfo(name='ceval', weight=1, task_type='exam', tags=['zh'], args={'few_shot_num': 0}),
            DatasetInfo(name='iquiz', weight=1, task_type='exam', tags=['zh'], args={'few_shot_num': 0}),
        ]),
        CollectionSchema(name='Code', datasets=[
            DatasetInfo(name='live_code_bench', weight=1, task_type='code', tags=['en'], args={'few_shot_num': 0, 'subset_list': ['v5_v6'], 'extra_params': {'start_date': '2025-01-01', 'end_date': '2025-04-30'}}),
        ]),
        CollectionSchema(name='Math&Science', datasets=[
            DatasetInfo(name='math_500', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='aime24', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='aime25', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
            DatasetInfo(name='gpqa', weight=1, task_type='knowledge', tags=['en'], args={'subset_list': ['gpqa_diamond'], 'few_shot_num': 0})
        ])
    ])

    # get the mixed data
    mixed_data = WeightedSampler(schema).sample(100000000)  # set a large number to ensure all datasets are sampled
    # dump the mixed data to a jsonl file
    dump_jsonl_data(mixed_data, 'outputs/qwen3_test.jsonl')


def run_test_think():
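    """Evaluate Qwen3-32B (thinking mode) on the mixed collection via an OpenAI-compatible API,
    removing everything up to '</think>' from each response before scoring."""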
    from evalscope import TaskConfig, run_task
    task_cfg = TaskConfig(
        model='Qwen3-32B',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'data_collection',
        ],
        dataset_args={
            'data_collection': {
                'dataset_id': 'modelscope/EvalScope-Qwen3-Test',
                'filters': {'remove_until': '</think>'}  # strip the thinking content before scoring
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 30000,  # maximum number of generated tokens; set high to avoid truncated output
            'temperature': 0.6,  # sampling temperature (recommended in the Qwen report)
            'top_p': 0.95,  # top-p sampling (recommended in the Qwen report)
            'top_k': 20,  # top-k sampling (recommended in the Qwen report)
            'n': 1,  # number of responses per request
        },
        timeout=60000,  # request timeout
        stream=True,  # use streaming output
        limit=100,  # evaluate only 100 samples for a quick test
    )

    run_task(task_cfg=task_cfg)


def run_test_no_think():
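    """Evaluate Qwen3-32B on the mixed collection with thinking disabled via chat_template_kwargs."""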
    from evalscope import TaskConfig, run_task
    task_cfg = TaskConfig(
        model='Qwen3-32B-no-think',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'data_collection',
        ],
        dataset_args={
            'data_collection': {
                'dataset_id': 'modelscope/EvalScope-Qwen3-Test',
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 10000,  # maximum number of generated tokens; set high to avoid truncated output
            'temperature': 0.7,  # sampling temperature (recommended in the Qwen report)
            'top_p': 0.8,  # top-p sampling (recommended in the Qwen report)
            'top_k': 20,  # top-k sampling (recommended in the Qwen report)
            'n': 1,  # number of responses per request
            'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
        },
        judge_worker_num=1,
        timeout=60000,  # request timeout
        stream=True,  # use streaming output
        limit=10,  # evaluate only 10 samples for a quick test
    )

    run_task(task_cfg=task_cfg)


def run_math_thinking():
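    """Evaluate Qwen3-32B (thinking mode) on math_500 only, removing the thinking content before scoring."""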
    from evalscope import TaskConfig, run_task
    task_cfg = TaskConfig(
        model='Qwen3-32B',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        eval_type='service',
        datasets=[
            'math_500',
        ],
        dataset_args={
            'math_500': {
                'filters': {'remove_until': '</think>'}  # strip the thinking content before scoring
            }
        },
        eval_batch_size=128,
        generation_config={
            'max_tokens': 30000,  # maximum number of generated tokens; set high to avoid truncated output
            'temperature': 0.6,  # sampling temperature (recommended in the Qwen report)
            'top_p': 0.95,  # top-p sampling (recommended in the Qwen report)
            'top_k': 20,  # top-k sampling (recommended in the Qwen report)
            'n': 1,  # number of responses per request
        },
        timeout=60000,  # request timeout
        stream=True,
        # use_cache='outputs/20250427_234222'
    )
    run_task(task_cfg=task_cfg)


if __name__ == '__main__':
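    # Uncomment the step to run: generate_collection() builds the data mix,
    # the run_* functions evaluate it against a locally served model.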
    # generate_collection()
    # run_test_think()
    # run_math_thinking()
    run_test_no_think()