131 lines
3.9 KiB
Python
131 lines
3.9 KiB
Python
import os
|
|
|
|
from evalscope import TaskConfig, run_task
|
|
|
|
DASHSCOPE_API_KEY = 'sk-723135c241x'
|
|
|
|
def eval_distill_qwen():
|
|
model_name = 'DeepSeek-R1-Distill-Qwen-7B'
|
|
dataset_name = 'math_500'
|
|
subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
|
|
|
|
task_config = TaskConfig(
|
|
api_url='http://0.0.0.0:8801/v1/chat/completions',
|
|
model=model_name,
|
|
eval_type='service',
|
|
datasets=[dataset_name],
|
|
dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
|
|
eval_batch_size=32,
|
|
generation_config={
|
|
'max_tokens': 20000, # avoid exceed max length
|
|
'temperature': 0.6,
|
|
'top_p': 0.95,
|
|
'n': 1,
|
|
},
|
|
)
|
|
run_task(task_config)
|
|
|
|
|
|
def eval_math_qwen():
|
|
model_name = 'Qwen2.5-Math-7B-Instruct'
|
|
dataset_name = 'math_500'
|
|
subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
|
|
|
|
task_config = TaskConfig(
|
|
api_url='http://0.0.0.0:8801/v1/chat/completions',
|
|
model=model_name,
|
|
eval_type='service',
|
|
datasets=[dataset_name],
|
|
dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
|
|
eval_batch_size=32,
|
|
generation_config={
|
|
'max_tokens': 3000, # avoid exceed max length
|
|
'temperature': 0.6,
|
|
'top_p': 0.95,
|
|
'n': 3,
|
|
},
|
|
)
|
|
run_task(task_config)
|
|
|
|
def eval_r1():
|
|
model_name = 'deepseek-r1'
|
|
dataset_name = 'math_500'
|
|
subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
|
|
|
|
task_config = TaskConfig(
|
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
|
|
api_key=DASHSCOPE_API_KEY,
|
|
model=model_name,
|
|
eval_type='service',
|
|
datasets=[dataset_name],
|
|
dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
|
|
eval_batch_size=8,
|
|
generation_config={
|
|
'max_tokens': 20000, # avoid exceed max length
|
|
'temperature': 0.6,
|
|
'top_p': 0.95,
|
|
'n': 1,
|
|
},
|
|
use_cache='./outputs/20250307_000404',
|
|
timeout=36000,
|
|
stream=True
|
|
)
|
|
run_task(task_config)
|
|
|
|
|
|
def eval_distill_32b():
|
|
model_name = 'deepseek-r1-distill-qwen-32b'
|
|
dataset_name = 'math_500'
|
|
subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
|
|
|
|
task_config = TaskConfig(
|
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
|
|
api_key=DASHSCOPE_API_KEY,
|
|
model=model_name,
|
|
eval_type='service',
|
|
datasets=[dataset_name],
|
|
dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
|
|
eval_batch_size=5,
|
|
generation_config={
|
|
'max_tokens': 12000, # avoid exceed max length
|
|
'temperature': 0.6,
|
|
'top_p': 0.95,
|
|
'n': 1,
|
|
},
|
|
use_cache='./outputs/20250306_235951',
|
|
timeout=32000,
|
|
stream=True
|
|
|
|
)
|
|
run_task(task_config)
|
|
|
|
def eval_qwq():
|
|
model_name = 'qwq-32b-preview'
|
|
dataset_name = 'math_500'
|
|
subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
|
|
|
|
task_config = TaskConfig(
|
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
|
|
api_key=os.environ['DASHSCOPE_API_KEY'],
|
|
model=model_name,
|
|
eval_type='service',
|
|
datasets=[dataset_name],
|
|
dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
|
|
eval_batch_size=32,
|
|
generation_config={
|
|
'max_tokens': 8000, # avoid exceed max length
|
|
'temperature': 0.6,
|
|
'top_p': 0.95,
|
|
'n': 1,
|
|
},
|
|
use_cache='./outputs/20250221_105911'
|
|
)
|
|
run_task(task_config)
|
|
|
|
if __name__ == '__main__':
|
|
# eval_distill_qwen()
|
|
# eval_math_qwen()
|
|
eval_r1()
|
|
# eval_qwq()
|
|
# eval_distill_32b()
|