# evalscope_v0.17.0/evalscope.0.17.0/tests/cli/test_custom.py
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import subprocess
import unittest

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
from evalscope.run import run_task
from evalscope.utils.import_utils import is_module_installed
from evalscope.utils.logger import get_logger
from tests.utils import test_level_list

env = dotenv_values('.env')

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

logger = get_logger()
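
# The service-backed tests below read the DashScope key from a local .env
# file via env.get('DASHSCOPE_API_KEY'), i.e. a single line such as
# (placeholder value, not a real key):
#   DASHSCOPE_API_KEY=sk-xxxx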


class TestRunCustom(unittest.TestCase):

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_custom_task(self):
        task_cfg = TaskConfig(
            model='Qwen/Qwen3-0.6B',
            datasets=[
                'general_mcq',
                'general_qa'
            ],
            dataset_args={
                'general_mcq': {
                    'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
                    'subset_list': [
                        'example'  # evaluation subset name: the * in the *_dev.csv files above
                    ],
                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
                },
                'general_qa': {
                    'local_path': 'custom_eval/text/qa',  # path to the custom dataset
                    'subset_list': [
                        'example'  # evaluation subset name: the * in the *_dev.csv files above
                    ]
                }
            },
        )
        res = run_task(task_cfg=task_cfg)
        print(res)
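
    # Illustrative on-disk layout for the custom datasets above (an assumption
    # for this example; see the evalscope custom-dataset docs for the exact schema):
    #   custom_eval/text/mcq/example_dev.csv   # few-shot split, columns like: id,question,A,B,C,D,answer
    #   custom_eval/text/mcq/example_val.csv   # evaluation split, same columns
    #   custom_eval/text/qa/example.jsonl      # one JSON object per line, e.g. {"query": ..., "response": ...}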

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_local_dataset(self):
        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                # 'mmlu',
                # 'race',
                'trivia_qa',
                # 'cmmlu',
                # 'humaneval',
                # 'gsm8k',
                # 'bbh',
                # 'competition_math',
                # 'arc',
                # 'ceval',
            ],
            dataset_args={
                'mmlu': {
                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                    'few_shot_num': 0,
                    'dataset_id': 'data/data/mmlu',
                },
                'ceval': {
                    'subset_list': [
                        'computer_network', 'operating_system', 'computer_architecture'
                    ],
                    'few_shot_num': 0,
                    'dataset_id': 'data/data/ceval',
                },
                'cmmlu': {
                    'subset_list': ['elementary_chinese'],
                    'dataset_id': 'data/data/cmmlu',
                    'few_shot_num': 0
                },
                'bbh': {
                    'subset_list': ['word_sorting', 'movie_recommendation'],
                },
                'humaneval': {
                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
                },
                'trivia_qa': {
                    'dataset_id': 'data/data/trivia_qa',
                },
            },
            eval_batch_size=10,
            limit=5,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
        )
        run_task(task_cfg=task_cfg)
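
    # Note: a relative 'dataset_id' such as 'data/data/trivia_qa' loads the
    # benchmark from a local copy instead of fetching it from the hub; the
    # expected directory layout is whatever the hub download produces
    # (assumption based on the paths used above).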

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_no_answer(self):
        task_cfg = TaskConfig(
            model='qwen2.5-72b-instruct',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                'general_qa',
            ],
            dataset_args={
                'general_qa': {
                    'dataset_id': 'custom_eval/text/qa',
                    'subset_list': [
                        'arena',
                        'example'
                    ],
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096
                },
                'score_type': 'numeric',
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.AUTO,
        )
        run_task(task_cfg=task_cfg)
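
    # With JudgeStrategy.AUTO the LLM judge is only consulted where rule-based
    # scoring cannot apply (e.g. QA items without a reference answer), and
    # 'score_type': 'numeric' asks the judge for a direct numeric score
    # (assumed semantics; verify against the evalscope judge docs).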

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_with_answer(self):
        task_cfg = TaskConfig(
            model='qwen-plus',
            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
            api_key=env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
                'general_qa',
            ],
            dataset_args={
                'general_qa': {
                    'dataset_id': 'custom_eval/text/qa',
                    'subset_list': [
                        'example'
                    ],
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 4096
                },
                'score_type': 'pattern',
            },
            judge_worker_num=5,
            judge_strategy=JudgeStrategy.LLM,
        )
        run_task(task_cfg=task_cfg)
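
    # JudgeStrategy.LLM routes every sample through the judge model, and
    # 'score_type': 'pattern' has the judge grade answers against the reference
    # answers present in this subset (assumed semantics).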

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_general_arena(self):
        task_cfg = TaskConfig(
            model_id='Arena',
            datasets=[
                'general_arena',
            ],
            dataset_args={
                'general_arena': {
                    'extra_params': {
                        'models': [
                            {
                                'name': 'qwen2.5-0.5b',
                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
                            },
                            {
                                'name': 'qwen2.5-7b',
                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
                            },
                            {
                                'name': 'qwen2.5-72b',
                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
                            }
                        ],
                        'baseline': 'qwen2.5-7b'
                    }
                }
            },
            eval_batch_size=10,
            limit=10,
            debug=True,
            stream=True,
            ignore_errors=False,
            judge_model_args={
                'model_id': 'qwen-plus',
                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': env.get('DASHSCOPE_API_KEY'),
                'generation_config': {
                    'temperature': 0.0,
                    'max_tokens': 8000
                },
            },
            judge_worker_num=5,
            use_cache='outputs/20250702_165727'
        )
        run_task(task_cfg=task_cfg)
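
    # 'general_arena' compares the previously generated report directories
    # listed above pairwise against the chosen baseline, with the judge model
    # as referee; use_cache points at an earlier run's output directory so its
    # predictions are reused (assumed behavior).


# Conventional unittest entry point (an assumption, for running this file directly):
if __name__ == '__main__':
    unittest.main()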