102 lines
5.9 KiB
Python
102 lines
5.9 KiB
Python
import argparse
|
|
import json
|
|
|
|
from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
|
|
|
|
|
|
class ParseStrArgsAction(argparse.Action):
|
|
|
|
def __call__(self, parser, namespace, values, option_string=None):
|
|
assert isinstance(values, str), 'args should be a string.'
|
|
|
|
# try json load first
|
|
try:
|
|
arg_dict = json.loads(values)
|
|
setattr(namespace, self.dest, arg_dict)
|
|
return
|
|
except (json.JSONDecodeError, ValueError):
|
|
pass
|
|
|
|
# If JSON load fails, fall back to parsing as key=value pairs
|
|
arg_dict = {}
|
|
for arg in values.strip().split(','):
|
|
key, value = map(str.strip, arg.split('=', 1)) # Use maxsplit=1 to handle multiple '='
|
|
try:
|
|
# Safely evaluate the value using eval
|
|
arg_dict[key] = eval(value)
|
|
except Exception:
|
|
# If eval fails, check if it's a boolean value
|
|
value_lower = value.lower()
|
|
if value_lower == 'true':
|
|
arg_dict[key] = True
|
|
elif value_lower == 'false':
|
|
arg_dict[key] = False
|
|
else:
|
|
# If not a boolean, keep the original string
|
|
arg_dict[key] = value
|
|
|
|
setattr(namespace, self.dest, arg_dict)
|
|
|
|
|
|
def add_argument(parser: argparse.ArgumentParser):
|
|
# yapf: disable
|
|
# Model-related arguments
|
|
parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
|
|
parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
|
|
parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
|
|
parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.') # noqa: E501
|
|
|
|
# Template-related arguments
|
|
parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
|
|
parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.') # noqa: E501
|
|
|
|
# Dataset-related arguments
|
|
parser.add_argument('--datasets', type=str, nargs='+', required=False, help='Dataset id list, align to the module name in evalscope.benchmarks') # noqa: E501
|
|
parser.add_argument('--dataset-args', type=json.loads, default='{}', help='The dataset args, should be a json string.') # noqa: E501
|
|
parser.add_argument('--dataset-dir', help='The datasets dir.')
|
|
parser.add_argument('--dataset-hub', help='The datasets hub.')
|
|
|
|
# Generation configuration arguments
|
|
parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501
|
|
|
|
# Evaluation-related arguments
|
|
parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
|
|
choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
|
|
parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
|
|
choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
|
|
parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
|
|
parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
|
|
choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
|
|
parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
|
|
parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
|
|
|
|
# Cache and working directory arguments
|
|
parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
|
|
parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
|
|
parser.add_argument('--work-dir', type=str, help='The root cache dir.')
|
|
|
|
# Debug and runtime mode arguments
|
|
parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
|
|
parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
|
|
parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
|
|
parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
|
|
parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
|
|
parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
|
|
parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
|
|
parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
|
|
|
|
# LLMJudge arguments
|
|
parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
|
|
parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
|
|
parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
|
|
parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
|
|
# yapf: enable
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
|
|
add_argument(parser)
|
|
|
|
args = parser.parse_args()
|
|
return args
|