# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import subprocess
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils import is_module_installed, test_level_list
from evalscope.utils.logger import get_logger

# Load API keys (e.g. DASHSCOPE_API_KEY) from a local .env file and enable debug logging.
env = dotenv_values('.env')
os.environ['LOG_LEVEL'] = 'DEBUG'

logger = get_logger()

class TestRun(unittest.TestCase):
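    """End-to-end tests for EvalScope's run entry points (the `evalscope eval` CLI and `run_task`)."""
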
def setUp(self) -> None:
logger.info('Init env for evalscope native run UTs ...\n')
self._check_env('evalscope')
def tearDown(self) -> None:
pass
@staticmethod
def _check_env(module_name: str):
if is_module_installed(module_name):
logger.info(f'{module_name} is installed.')
else:
            raise ModuleNotFoundError(f'{module_name} is not installed. Run: pip install {module_name}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_simple_eval(self):
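        """Run a minimal `evalscope eval` CLI command on a small model and a single dataset and assert it exits cleanly."""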
model = 'qwen/Qwen2-0.5B-Instruct'
datasets = 'arc' # arc ceval
limit = 10
cmd_simple = f'evalscope eval ' \
f'--model {model} ' \
f'--datasets {datasets} ' \
f'--limit {limit}'
logger.info(f'Start to run command: {cmd_simple}')
run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
logger.info(f'>>test_run_simple_eval stdout: {run_res.stdout}')
logger.error(f'>>test_run_simple_eval stderr: {run_res.stderr}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_eval_with_args(self):
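        """Run the CLI with extra flags: a generation config and per-dataset arguments passed as a JSON string."""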
model = 'qwen/Qwen2-0.5B-Instruct'
datasets = 'arc' # arc ceval
limit = 5
        dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'  # only takes effect when 'ceval' is among --datasets
cmd_with_args = f'evalscope eval ' \
f'--model {model} ' \
f'--datasets {datasets} ' \
f'--limit {limit} ' \
f'--generation-config do_sample=false,temperature=0.0 ' \
f"""--dataset-args \'{dataset_args}\' """
logger.info(f'Start to run command: {cmd_with_args}')
run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_yaml_config(self):
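        """Run an evaluation task defined entirely in a YAML config file."""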
from evalscope import run_task
run_task(task_cfg='examples/tasks/eval_native.yaml')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_task(self):
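        """Run a native evaluation via `run_task` with a `TaskConfig`; extra benchmarks are left commented out so they can be toggled on individually."""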
task_cfg = TaskConfig(
model='qwen/Qwen2.5-0.5B-Instruct',
datasets=[
'iquiz',
# 'ifeval',
# 'mmlu',
# 'mmlu_pro',
# 'musr',
# 'process_bench',
# 'race',
# 'trivia_qa',
# 'cmmlu',
# 'humaneval',
# 'super_gpqa',
# 'gsm8k',
# 'bbh',
# 'competition_math',
# 'math_500',
'aime24',
'gpqa',
# 'arc',
# 'ceval',
# 'hellaswag',
# 'general_mcq',
# 'general_qa'
],
dataset_args={
'mmlu': {
'subset_list': ['elementary_mathematics'],
'few_shot_num': 0
},
'mmlu_pro': {
'subset_list': ['math', 'health'],
'few_shot_num': 4
},
'ceval': {
'subset_list': [
'computer_network', 'operating_system', 'computer_architecture'
],
'few_shot_num': 0
},
'cmmlu': {
'subset_list': ['elementary_chinese'],
'few_shot_num': 0
},
'bbh': {
'subset_list': ['word_sorting', 'movie_recommendation'],
},
'gpqa': {
'subset_list': ['gpqa_diamond'],
'few_shot_num': 0
},
'humaneval': {
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
},
'competition_math': {
'subset_list': ['Level 1']
},
'process_bench': {
'subset_list': ['gsm8k'],
},
'musr': {
'subset_list': ['murder_mysteries'],
},
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
],
'metric_list': ['AverageBLEU']
},
'super_gpqa': {
'subset_list': ['Philosophy', 'Education'],
'few_shot_num': 0
},
'ifeval': {
'filters': {
'remove_until': '</think>'
}
}
},
limit=2,
eval_batch_size=2,
generation_config={
'max_new_tokens': 2048,
'temperature': 0.7,
'num_return_sequences': 1,
},
# debug=True
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_custom_task(self):
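        """Evaluate custom local datasets (`general_mcq` and `general_qa`) loaded from `custom_eval/text`."""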
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen/Qwen2-0.5B-Instruct',
            datasets=['general_mcq', 'general_qa'],  # data formats; the multiple-choice format follows the 'ceval' layout
dataset_args={
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
]
}
},
)
res = run_task(task_cfg=task_cfg)
print(res)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_one_task(self):
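        """Run a single local-model evaluation with Qwen3-1.7B, using the Qwen-recommended sampling settings and thinking mode disabled."""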
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='Qwen/Qwen3-1.7B',
datasets=[
'iquiz',
# 'math_500',
# 'aime24',
# 'competition_math',
# 'mmlu',
# 'simple_qa',
],
model_args={
'device_map': 'auto',
},
dataset_args={
'competition_math': {
'subset_list': ['Level 4', 'Level 5']
},
'mmlu': {
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
'few_shot_num': 0
},
},
limit=5,
eval_batch_size=5,
generation_config={
                'max_new_tokens': 1000,  # maximum number of generated tokens; use a large value to avoid truncated output
                'temperature': 0.7,  # sampling temperature (value recommended in the Qwen report)
                'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
                'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
},
judge_strategy=JudgeStrategy.AUTO,
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_task_loop(self):
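        """Run several `TaskConfig`s back to back by passing a list to `run_task`, giving each run its own `model_id`."""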
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
from evalscope.config import TaskConfig
task_cfg1 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model1',
datasets=['iquiz'],
limit=10
)
task_cfg2 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model2',
datasets=['iquiz'],
limit=10
)
task_cfg3 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model3',
datasets=['iquiz'],
limit=10
)
run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_server_model(self):
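        """Evaluate a served, OpenAI-compatible model (DashScope `qwen-plus`) with `EvalType.SERVICE`; the API key is read from the local `.env` file."""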
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen-plus',
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
eval_type=EvalType.SERVICE,
datasets=[
# 'iquiz',
# 'ifeval',
# 'mmlu',
# 'mmlu_pro',
# 'musr',
# 'process_bench',
# 'race',
# 'trivia_qa',
# 'cmmlu',
# 'humaneval',
# 'gsm8k',
# 'bbh',
# 'competition_math',
# 'math_500',
# 'aime24',
# 'gpqa',
# 'arc',
# 'ceval',
# 'hellaswag',
# 'general_mcq',
# 'general_qa',
# 'super_gpqa',
# 'mmlu_redux',
# 'maritime_bench',
# 'drop',
# 'winogrande',
'tool_bench',
],
dataset_args={
'mmlu': {
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
'few_shot_num': 0
},
'mmlu_pro': {
'subset_list': ['math', 'health'],
'few_shot_num': 4
},
'ceval': {
'subset_list': [
'computer_network', 'operating_system', 'computer_architecture'
],
'few_shot_num': 0
},
'cmmlu': {
'subset_list': ['elementary_chinese'],
'few_shot_num': 0
},
'bbh': {
'subset_list': ['word_sorting', 'movie_recommendation'],
},
'gpqa': {
# 'subset_list': ['gpqa_diamond'],
'few_shot_num': 0,
'local_path': './data/data/gpqa',
},
'humaneval': {
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
},
'competition_math': {
'subset_list': ['Level 1']
},
'process_bench': {
'subset_list': ['gsm8k'],
},
'musr': {
'subset_list': ['murder_mysteries'],
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
},
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
],
'metric_list': ['AverageRouge']
},
'super_gpqa': {
# 'subset_list': ['Philosophy', 'Education'],
'few_shot_num': 0
},
                'mmlu_redux': {
'subset_list': ['abstract_algebra']
},
},
eval_batch_size=32,
limit=10,
debug=True,
stream=False,
generation_config={
'temperature': 0,
'n': 1,
'max_tokens': 4096,
},
# ignore_errors=True,
use_cache='outputs/20250519_142106'
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_batch_eval(self):
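        """Run a batched local evaluation with multiple sampled generations per prompt."""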
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='LLM-Research/Llama-3.2-1B-Instruct',
datasets=[
# 'math_500',
# 'aime24',
# 'competition_math'
# 'arc',
'gsm8k'
# 'truthful_qa'
],
dataset_args={
'competition_math': {
'subset_list': ['Level 4', 'Level 5']
}
},
eval_batch_size=2,
limit=5,
generation_config={
'max_new_tokens': 2048,
'temperature': 0.7,
'num_return_sequences': 2,
}
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_judge_model(self):
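        """Evaluate a served model on datasets that require an LLM judge, configuring a separate judge model through `judge_model_args`."""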
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen2.5-0.5b-instruct',
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
eval_type=EvalType.SERVICE,
datasets=[
# 'math_500',
# 'aime24',
# 'competition_math',
# 'arc',
# 'gsm8k'
# 'truthful_qa',
# 'simple_qa',
'chinese_simpleqa',
# 'live_code_bench',
# 'humaneval',
# 'general_qa',
# 'alpaca_eval',
# 'arena_hard'
],
dataset_args={
'competition_math': {
'subset_list': ['Level 4']
},
'live_code_bench': {
'extra_params': {
'start_date': '2024-08-01',
'end_date': '2025-02-28'
},
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
]
},
'chinese_simpleqa': {
'subset_list': [
                        '中华文化'  # the 'Chinese Culture' subset
]
},
},
eval_batch_size=10,
limit=10,
judge_strategy=JudgeStrategy.AUTO,
judge_worker_num=5,
judge_model_args={
'model_id': 'qwen2.5-7b-instruct',
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
'api_key': env.get('DASHSCOPE_API_KEY'),
'generation_config': {
'temperature': 0.0,
'max_tokens': 4096
}
},
generation_config={
'max_new_tokens': 20000,
'temperature': 0.0,
'seed': 42,
'n': 1
},
timeout=60000,
stream=True,
use_cache='outputs/20250519_142551'
)
run_task(task_cfg=task_cfg)
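
# Note: which tests actually run is gated by `test_level_list()`. In ModelScope-style
# repos this is typically controlled by the `TEST_LEVEL_LIST` environment variable
# (an assumption here); if so, the level-0 tests above can be run directly with, e.g.:
#   TEST_LEVEL_LIST=0 python tests/cli/test_run.py TestRun.test_run_simple_eval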
if __name__ == '__main__':
unittest.main()