evalscope_v0.17.0/evalscope.0.17.0/evalscope/arguments.py

import argparse
import ast
import json
from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType


class ParseStrArgsAction(argparse.Action):
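    """Argparse action that parses a string option into a dict.

    The raw value may be a JSON object string or comma-separated ``key=value``
    pairs. A doctest-style sketch (the option name is illustrative):

        >>> p = argparse.ArgumentParser()
        >>> _ = p.add_argument('--model-args', action=ParseStrArgsAction)
        >>> p.parse_args(['--model-args', 'temperature=0.7,do_sample=true']).model_args
        {'temperature': 0.7, 'do_sample': True}
    """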
    def __call__(self, parser, namespace, values, option_string=None):
        assert isinstance(values, str), 'args should be a string.'
        # Try to parse the whole string as JSON first
        try:
            arg_dict = json.loads(values)
            setattr(namespace, self.dest, arg_dict)
            return
        except (json.JSONDecodeError, ValueError):
            pass
        # If JSON parsing fails, fall back to comma-separated key=value pairs
        arg_dict = {}
        for arg in values.strip().split(','):
            key, value = map(str.strip, arg.split('=', 1))  # maxsplit=1 handles values containing '='
            try:
                # Safely evaluate the value as a Python literal (number, list, ...)
                arg_dict[key] = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: check for a lowercase boolean
                value_lower = value.lower()
                if value_lower == 'true':
                    arg_dict[key] = True
                elif value_lower == 'false':
                    arg_dict[key] = False
                else:
                    # Otherwise keep the original string
                    arg_dict[key] = value
        setattr(namespace, self.dest, arg_dict)


def add_argument(parser: argparse.ArgumentParser):
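    """Register all evalscope CLI options on ``parser``.

    Grouped by concern: model, template, dataset, generation, evaluation,
    cache, runtime, and LLM-judge settings.
    """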
    # yapf: disable
    # Model-related arguments
    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or a local model dir.')
    parser.add_argument('--model-id', type=str, required=False, help='The model id used as the model name in the report.')
    parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
    parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The task type for the model.')  # noqa: E501
    # Template-related arguments
    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
    parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501
    # Dataset-related arguments
    parser.add_argument('--datasets', type=str, nargs='+', required=False, help='Dataset id list, aligned to the module names in evalscope.benchmarks')  # noqa: E501
    parser.add_argument('--dataset-args', type=json.loads, default='{}', help='The dataset args, should be a json string.')  # noqa: E501
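    # Hedged example of a --dataset-args value (the per-dataset fields depend
    # on the benchmark adapter; the keys shown here are an assumption):
    #   --dataset-args '{"gsm8k": {"few_shot_num": 4}}'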
    parser.add_argument('--dataset-dir', help='The datasets dir.')
    parser.add_argument('--dataset-hub', help='The datasets hub.')
    # Generation configuration arguments
    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
    # Evaluation-related arguments
    parser.add_argument('--eval-type', type=str, help='The evaluation type.',
                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for the evaluation backend.')  # noqa: E501
    parser.add_argument('--stage', type=str, default='all', help='The stage of the evaluation pipeline.',
                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
    parser.add_argument('--limit', type=float, default=None, help='Max number of evaluation samples for each subset.')
    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
    # Cache and working directory arguments
    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
    parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
    parser.add_argument('--work-dir', type=str, help='The root cache dir.')
    # Debug and runtime mode arguments
    parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single-processing mode.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
    parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
    parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
    # LLMJudge arguments
    parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
    parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate an analysis report for the evaluation results using the judge model.')  # noqa: E501
    # yapf: enable


def parse_args():
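    """Build the evalscope argument parser and parse ``sys.argv``."""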
    parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
    add_argument(parser)
    args = parser.parse_args()
    return args
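

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original module: parse a
    # sample command line and print the resulting namespace. The model id and
    # dataset names below are illustrative placeholders, not defaults.
    import sys
    sys.argv = [
        'evalscope',
        '--model', 'qwen/Qwen2-7B-Instruct',  # placeholder model id
        '--datasets', 'gsm8k', 'arc',  # placeholder dataset ids
        '--model-args', 'revision=master,precision=torch.float16',
        '--generation-config', '{"max_new_tokens": 256}',
    ]
    print(parse_args())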