# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import subprocess
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils import is_module_installed, test_level_list
from evalscope.utils.logger import get_logger

# Load API keys (e.g. DASHSCOPE_API_KEY) from a local .env file and enable debug logging.
env = dotenv_values('.env')
os.environ['LOG_LEVEL'] = 'DEBUG'

logger = get_logger()

class TestRun(unittest.TestCase):
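    """End-to-end tests for EvalScope's run entry points (the `evalscope eval` CLI and `run_task`)."""
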
def setUp(self) -> None:
logger.info('Init env for evalscope native run UTs ...\n')
self._check_env('evalscope')
def tearDown(self) -> None:
pass
@staticmethod
def _check_env(module_name: str):
if is_module_installed(module_name):
logger.info(f'{module_name} is installed.')
else:
            raise ModuleNotFoundError(f'{module_name} is not installed. Run: pip install {module_name}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_simple_eval(self):
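        """Run a minimal `evalscope eval` CLI command on a small model and a single dataset and assert it exits cleanly."""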
model = 'qwen/Qwen2-0.5B-Instruct'
datasets = 'arc' # arc ceval
limit = 10
cmd_simple = f'evalscope eval ' \
f'--model {model} ' \
f'--datasets {datasets} ' \
f'--limit {limit}'
logger.info(f'Start to run command: {cmd_simple}')
run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
logger.info(f'>>test_run_simple_eval stdout: {run_res.stdout}')
logger.error(f'>>test_run_simple_eval stderr: {run_res.stderr}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_eval_with_args(self):
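        """Run the CLI with extra flags: a generation config and per-dataset arguments passed as a JSON string."""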
model = 'qwen/Qwen2-0.5B-Instruct'
datasets = 'arc' # arc ceval
limit = 5
        dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'  # only takes effect when 'ceval' is among --datasets
cmd_with_args = f'evalscope eval ' \
f'--model {model} ' \
f'--datasets {datasets} ' \
f'--limit {limit} ' \
f'--generation-config do_sample=false,temperature=0.0 ' \
f"""--dataset-args \'{dataset_args}\' """
logger.info(f'Start to run command: {cmd_with_args}')
run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_yaml_config(self):
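        """Run an evaluation task defined entirely in a YAML config file."""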
from evalscope import run_task
run_task(task_cfg='examples/tasks/eval_native.yaml')
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_task(self):
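        """Run a native evaluation via `run_task` with a `TaskConfig`; extra benchmarks are left commented out so they can be toggled on individually."""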
task_cfg = TaskConfig(
model='qwen/Qwen2.5-0.5B-Instruct',
datasets=[
'iquiz',
# 'ifeval',
# 'mmlu',
# 'mmlu_pro',
# 'musr',
# 'process_bench',
# 'race',
# 'trivia_qa',
# 'cmmlu',
# 'humaneval',
# 'super_gpqa',
# 'gsm8k',
# 'bbh',
# 'competition_math',
# 'math_500',
'aime24',
'gpqa',
# 'arc',
# 'ceval',
# 'hellaswag',
# 'general_mcq',
# 'general_qa'
],
dataset_args={
'mmlu': {
'subset_list': ['elementary_mathematics'],
'few_shot_num': 0
},
'mmlu_pro': {
'subset_list': ['math', 'health'],
'few_shot_num': 4
},
'ceval': {
'subset_list': [
'computer_network', 'operating_system', 'computer_architecture'
],
'few_shot_num': 0
},
'cmmlu': {
'subset_list': ['elementary_chinese'],
'few_shot_num': 0
},
'bbh': {
'subset_list': ['word_sorting', 'movie_recommendation'],
},
'gpqa': {
'subset_list': ['gpqa_diamond'],
'few_shot_num': 0
},
'humaneval': {
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
},
'competition_math': {
'subset_list': ['Level 1']
},
'process_bench': {
'subset_list': ['gsm8k'],
},
'musr': {
'subset_list': ['murder_mysteries'],
},
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
],
'metric_list': ['AverageBLEU']
},
'super_gpqa': {
'subset_list': ['Philosophy', 'Education'],
'few_shot_num': 0
},
'ifeval': {
'filters': {
'remove_until': '</think>'
}
}
},
limit=2,
eval_batch_size=2,
generation_config={
'max_new_tokens': 2048,
'temperature': 0.7,
'num_return_sequences': 1,
},
# debug=True
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_custom_task(self):
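        """Evaluate custom local datasets (`general_mcq` and `general_qa`) loaded from `custom_eval/text`."""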
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen/Qwen2-0.5B-Instruct',
            datasets=['general_mcq', 'general_qa'],  # data formats; the multiple-choice format follows the 'ceval' layout
dataset_args={
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
]
}
},
)
res = run_task(task_cfg=task_cfg)
print(res)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_one_task(self):
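        """Run a single local-model evaluation with Qwen3-1.7B, using the Qwen-recommended sampling settings and thinking mode disabled."""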
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='Qwen/Qwen3-1.7B',
datasets=[
'iquiz',
# 'math_500',
# 'aime24',
# 'competition_math',
# 'mmlu',
# 'simple_qa',
],
model_args={
'device_map': 'auto',
},
dataset_args={
'competition_math': {
'subset_list': ['Level 4', 'Level 5']
},
'mmlu': {
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
'few_shot_num': 0
},
},
limit=5,
eval_batch_size=5,
generation_config={
                'max_new_tokens': 1000,  # maximum number of generated tokens; use a large value to avoid truncated output
                'temperature': 0.7,  # sampling temperature (value recommended in the Qwen report)
                'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
                'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
},
judge_strategy=JudgeStrategy.AUTO,
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_task_loop(self):
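        """Run several `TaskConfig`s back to back by passing a list to `run_task`, giving each run its own `model_id`."""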
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
from evalscope.config import TaskConfig
task_cfg1 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model1',
datasets=['iquiz'],
limit=10
)
task_cfg2 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model2',
datasets=['iquiz'],
limit=10
)
task_cfg3 = TaskConfig(
model='Qwen/Qwen2.5-0.5B-Instruct',
model_id='model3',
datasets=['iquiz'],
limit=10
)
run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_server_model(self):
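        """Evaluate a served, OpenAI-compatible model (DashScope `qwen-plus`) with `EvalType.SERVICE`; the API key is read from the local `.env` file."""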
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen-plus',
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
eval_type=EvalType.SERVICE,
datasets=[
# 'iquiz',
# 'ifeval',
# 'mmlu',
# 'mmlu_pro',
# 'musr',
# 'process_bench',
# 'race',
# 'trivia_qa',
# 'cmmlu',
# 'humaneval',
# 'gsm8k',
# 'bbh',
# 'competition_math',
# 'math_500',
# 'aime24',
# 'gpqa',
# 'arc',
# 'ceval',
# 'hellaswag',
# 'general_mcq',
# 'general_qa',
# 'super_gpqa',
# 'mmlu_redux',
# 'maritime_bench',
# 'drop',
# 'winogrande',
'tool_bench',
],
dataset_args={
'mmlu': {
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
'few_shot_num': 0
},
'mmlu_pro': {
'subset_list': ['math', 'health'],
'few_shot_num': 4
},
'ceval': {
'subset_list': [
'computer_network', 'operating_system', 'computer_architecture'
],
'few_shot_num': 0
},
'cmmlu': {
'subset_list': ['elementary_chinese'],
'few_shot_num': 0
},
'bbh': {
'subset_list': ['word_sorting', 'movie_recommendation'],
},
'gpqa': {
# 'subset_list': ['gpqa_diamond'],
'few_shot_num': 0,
'local_path': './data/data/gpqa',
},
'humaneval': {
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
},
'competition_math': {
'subset_list': ['Level 1']
},
'process_bench': {
'subset_list': ['gsm8k'],
},
'musr': {
'subset_list': ['murder_mysteries'],
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
},
'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
'subset_list': [
                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files
],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
],
'metric_list': ['AverageRouge']
},
'super_gpqa': {
# 'subset_list': ['Philosophy', 'Education'],
'few_shot_num': 0
},
                'mmlu_redux': {
'subset_list': ['abstract_algebra']
},
},
eval_batch_size=32,
limit=10,
debug=True,
stream=False,
generation_config={
'temperature': 0,
'n': 1,
'max_tokens': 4096,
},
# ignore_errors=True,
use_cache='outputs/20250519_142106'
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_batch_eval(self):
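        """Run a batched local evaluation with multiple sampled generations per prompt."""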
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='LLM-Research/Llama-3.2-1B-Instruct',
datasets=[
# 'math_500',
# 'aime24',
# 'competition_math'
# 'arc',
'gsm8k'
# 'truthful_qa'
],
dataset_args={
'competition_math': {
'subset_list': ['Level 4', 'Level 5']
}
},
eval_batch_size=2,
limit=5,
generation_config={
'max_new_tokens': 2048,
'temperature': 0.7,
'num_return_sequences': 2,
}
)
run_task(task_cfg=task_cfg)
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_judge_model(self):
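        """Evaluate a served model on datasets that require an LLM judge, configuring a separate judge model through `judge_model_args`."""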
from evalscope.config import TaskConfig
task_cfg = TaskConfig(
model='qwen2.5-0.5b-instruct',
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
eval_type=EvalType.SERVICE,
datasets=[
# 'math_500',
# 'aime24',
# 'competition_math',
# 'arc',
# 'gsm8k'
# 'truthful_qa',
# 'simple_qa',
'chinese_simpleqa',
# 'live_code_bench',
# 'humaneval',
# 'general_qa',
# 'alpaca_eval',
# 'arena_hard'
],
dataset_args={
'competition_math': {
'subset_list': ['Level 4']
},
'live_code_bench': {
'extra_params': {
'start_date': '2024-08-01',
'end_date': '2025-02-28'
},
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
},
'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
'subset_list': [
                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files
# 'test'
]
},
'chinese_simpleqa': {
'subset_list': [
                        '中华文化'  # the 'Chinese Culture' subset
]
},
},
eval_batch_size=10,
limit=10,
judge_strategy=JudgeStrategy.AUTO,
judge_worker_num=5,
judge_model_args={
'model_id': 'qwen2.5-7b-instruct',
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
'api_key': env.get('DASHSCOPE_API_KEY'),
'generation_config': {
'temperature': 0.0,
'max_tokens': 4096
}
},
generation_config={
'max_new_tokens': 20000,
'temperature': 0.0,
'seed': 42,
'n': 1
},
timeout=60000,
stream=True,
use_cache='outputs/20250519_142551'
)
run_task(task_cfg=task_cfg)
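
# Note: which tests actually run is gated by `test_level_list()`. In ModelScope-style
# repos this is typically controlled by the `TEST_LEVEL_LIST` environment variable
# (an assumption here); if so, the level-0 tests above can be run directly with, e.g.:
#   TEST_LEVEL_LIST=0 python tests/cli/test_run.py TestRun.test_run_simple_eval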
if __name__ == '__main__':
unittest.main()