import json
import os
import unittest

from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.utils.io_utils import dump_jsonl_data
from evalscope.utils.utils import test_level_list


class TestCollection(unittest.TestCase):
    """End-to-end tests for dataset collections: schema creation, weighted
    sampling, and evaluation of the resulting mixed dataset."""

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_create_collection(self):
        # Build a nested schema mixing math and reasoning datasets; `weight`
        # is the relative sampling weight consumed by WeightedSampler.
        schema = CollectionSchema(
            name='math&reasoning',
            datasets=[
                CollectionSchema(
                    name='math',
                    datasets=[
                        CollectionSchema(
                            name='generation',
                            datasets=[
                                DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
                                DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
                            ]),
                        CollectionSchema(
                            name='multiple_choice',
                            datasets=[
                                DatasetInfo(
                                    name='cmmlu',
                                    weight=2,
                                    task_type='math',
                                    tags=['zh', 'math'],
                                    args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
                                DatasetInfo(
                                    name='ceval',
                                    weight=3,
                                    task_type='math',
                                    tags=['zh', 'math'],
                                    args={
                                        'subset_list': [
                                            'advanced_mathematics', 'high_school_mathematics',
                                            'discrete_mathematics', 'middle_school_mathematics'
                                        ]
                                    }),
                            ]),
                    ]),
                CollectionSchema(
                    name='reasoning',
                    datasets=[
                        DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
                        DatasetInfo(
                            name='ceval',
                            weight=1,
                            task_type='reasoning',
                            tags=['zh', 'reasoning'],
                            args={'subset_list': ['logic']}),
                        DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
                    ]),
            ])
        print(schema.to_dict())
        print(schema.flatten())
        schema.dump_json('outputs/schema_test.json')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_generate_data(self):
        # Reload the schema dumped above and draw a 100-sample weighted mixture.
        with open('outputs/schema_test.json', 'r') as f:
            schema = CollectionSchema.from_dict(json.load(f))
        print(schema.to_dict())
        mixed_data = WeightedSampler(schema).sample(100)
        dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_evaluate_collection(self):
        # Evaluate the mixed dataset against a locally served model exposing
        # an OpenAI-compatible chat completions endpoint.
        from evalscope import TaskConfig, run_task
        task_cfg = TaskConfig(
            model='Qwen2.5-0.5B-Instruct',
            api_url='http://127.0.0.1:8801/v1/chat/completions',
            api_key='EMPTY',
            eval_type=EvalType.SERVICE,
            datasets=['data_collection'],
            dataset_args={'data_collection': {
                'local_path': 'outputs/mixed_data_test.jsonl'
                # 'local_path': 'outputs/weighted_mixed_data.jsonl'
            }},
        )
        run_task(task_cfg=task_cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_evaluate_collection_with_judge(self):
        # Same evaluation, but scored with an LLM judge (LLM_RECALL strategy)
        # via DashScope, reusing cached results from a previous run directory.
        from evalscope import TaskConfig, run_task
        task_cfg = TaskConfig(
            model='qwen2.5-7b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=['data_collection'],
            dataset_args={'data_collection': {
                'local_path': 'outputs/mixed_data_test.jsonl'
                # 'local_path': 'outputs/weighted_mixed_data.jsonl'
            }},
            limit=10,
            judge_strategy=JudgeStrategy.LLM_RECALL,
            judge_model_args={
                'model_id': 'qwen2.5-7b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': os.getenv('DASHSCOPE_API_KEY'),
            },
            use_cache='outputs/20250519_114427',
        )
        res = run_task(task_cfg=task_cfg)
        print(res)
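

if __name__ == '__main__':
    # Standard unittest entry point so the suite can be run directly with
    # `python test_collection.py`; individual tests remain gated by the
    # test_level_list() skip conditions above.
    unittest.main()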