# evalscope_v0.17.0/evalscope.0.17.0/examples/example_deepseek_distill_co...
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
from evalscope.utils.io_utils import dump_jsonl_data
# define the schema
schema = CollectionSchema(name='DeepSeekDistill', datasets=[
    CollectionSchema(name='Math', datasets=[
        DatasetInfo(name='math_500', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
        DatasetInfo(name='gpqa', weight=1, task_type='math', tags=['en'], args={'subset_list': ['gpqa_diamond'], 'few_shot_num': 0}),
        DatasetInfo(name='aime24', weight=1, task_type='math', tags=['en'], args={'few_shot_num': 0}),
    ])
])
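# Optional inspection step (illustrative, not part of the original example): printing the
# schema is a quick way to confirm the nested collection looks right before sampling.
# dump_json() is an assumed helper based on evalscope's collection docs and may differ
# across versions; skip it if your version does not provide it.
print(schema)  # repr of the nested collection
schema.dump_json('outputs/schema.json')  # assumed helper: persist the schema for reuse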
# get the mixed data
mixed_data = WeightedSampler(schema).sample(100000) # set a large number to ensure all datasets are sampled
# dump the mixed data to a jsonl file
dump_jsonl_data(mixed_data, 'outputs/deepseek_distill_test.jsonl')
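# Quick sanity check (illustrative, not part of the original example), assuming sample()
# returns a list of record dicts: print the size of the mixture and one record to confirm
# the sampled data looks as expected before running the evaluation.
print(f'sampled {len(mixed_data)} examples')
print(mixed_data[0])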

from evalscope import TaskConfig, run_task
from evalscope.constants import EvalType
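# Optional connectivity check (a minimal sketch, not part of the original example): it
# assumes the model is already being served with an OpenAI-compatible chat completions
# API (e.g. via vLLM or LMDeploy) at the api_url configured below, and fails fast if the
# endpoint is unreachable.
import requests

resp = requests.post(
    'http://127.0.0.1:8801/v1/chat/completions',
    headers={'Authorization': 'Bearer EMPTY'},  # most OpenAI-compatible servers accept a dummy key
    json={
        'model': 'DeepSeek-R1-Distill-Qwen-1.5B',
        'messages': [{'role': 'user', 'content': 'ping'}],
        'max_tokens': 8,
    },
    timeout=30,
)
resp.raise_for_status()  # raise early if the inference service is not up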
# configure and start the evaluation task
task_cfg = TaskConfig(
    model='DeepSeek-R1-Distill-Qwen-1.5B',
    api_url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    eval_type=EvalType.SERVICE,  # evaluate a deployed model service rather than a local checkpoint
    datasets=[
        'data_collection',  # evaluate the mixed data collection built above
    ],
    dataset_args={
        'data_collection': {
            'local_path': 'outputs/deepseek_distill_test.jsonl'
        }
    },
    eval_batch_size=32,
    generation_config={
        'max_tokens': 30000,  # generous limit for long reasoning chains; avoids exceeding the max length
        'temperature': 0.6,
        'top_p': 0.95,
        'n': 1,  # number of responses per sample; note that lmdeploy only supports n=1
    },
)
run_task(task_cfg=task_cfg)