# evalscope_v0.17.0/evalscope.0.17.0/evalscope/arguments.py

import argparse
import json

from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType


class ParseStrArgsAction(argparse.Action):
    """Argparse action that converts a string option value into a Python object.

    Two input formats are accepted:

    * a JSON document, e.g. ``'{"temperature": 0.7}'`` -- whatever
      ``json.loads`` returns is stored unchanged;
    * comma-separated ``key=value`` pairs, e.g. ``'do_sample=true,top_k=5'``
      -- stored as a ``dict``. Blank segments (empty input, trailing commas)
      are ignored.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse ``values`` and store the result on ``namespace`` under ``self.dest``.

        Raises:
            argparse.ArgumentError: if ``values`` is not a string, or a
                non-blank segment of the key=value form has no ``=``.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(values, str):
            raise argparse.ArgumentError(self, 'args should be a string.')

        # Try JSON first; json.JSONDecodeError is a subclass of ValueError.
        try:
            parsed = json.loads(values)
        except ValueError:
            # Not JSON: fall back to the key=value,key=value form.
            parsed = self._parse_key_value_pairs(values)

        setattr(namespace, self.dest, parsed)

    def _parse_key_value_pairs(self, text):
        """Return a dict built from comma-separated ``key=value`` segments of ``text``."""
        arg_dict = {}
        for segment in text.split(','):
            segment = segment.strip()
            if not segment:
                # Tolerate empty input and stray/trailing commas.
                continue
            # partition splits on the first '=' only, so values may contain '='.
            key, sep, raw_value = segment.partition('=')
            if not sep:
                raise argparse.ArgumentError(self, "expected 'key=value' but got %r" % segment)
            arg_dict[key.strip()] = self._parse_value(raw_value.strip())
        return arg_dict

    @staticmethod
    def _parse_value(raw_value):
        """Convert one raw value string to a Python object, falling back to the string itself."""
        try:
            # NOTE(security): eval() executes arbitrary Python expressions taken
            # from the command line. It is kept for backward compatibility with
            # existing invocations; consider ast.literal_eval if only literals
            # need to be supported.
            # The expression is evaluated in an empty namespace so that names
            # from this module or the calling method (e.g. `key`, `values`)
            # cannot leak into the result.
            return eval(raw_value, {})
        except Exception:
            # Not a valid expression: accept case-insensitive booleans,
            # otherwise keep the original string.
            lowered = raw_value.lower()
            if lowered == 'true':
                return True
            if lowered == 'false':
                return False
            return raw_value


def add_argument(parser: argparse.ArgumentParser):
    """Register the evaluation command-line options on ``parser``.

    Options are added in place, grouped by topic: model, template, dataset,
    generation config, evaluation, cache/working directory, debug/runtime,
    and LLM judge. Nothing is returned.
    """
    # yapf: disable
    add = parser.add_argument

    # Model-related arguments
    model_tasks = [ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION]
    add('--model', type=str, required=False,
        help='The model id on modelscope, or local model dir.')
    add('--model-id', type=str, required=False,
        help='The model id for model name in report.')
    add('--model-args', type=str, action=ParseStrArgsAction,
        help='The model args, should be a string.')
    add('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=model_tasks,
        help='The model task for model id.')

    # Template-related arguments
    add('--template-type', type=str, required=False,
        help='Deprecated, will be removed in v1.0.0.')
    add('--chat-template', type=str, required=False,
        help='The custom jinja template for chat generation.')

    # Dataset-related arguments
    add('--datasets', type=str, nargs='+', required=False,
        help='Dataset id list, align to the module name in evalscope.benchmarks')
    add('--dataset-args', type=json.loads, default='{}',
        help='The dataset args, should be a json string.')
    add('--dataset-dir', help='The datasets dir.')
    add('--dataset-hub', help='The datasets hub.')

    # Generation configuration arguments
    add('--generation-config', type=str, action=ParseStrArgsAction,
        help='The generation config, should be a string.')

    # Evaluation-related arguments
    eval_types = [EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE]
    eval_backends = [
        EvalBackend.NATIVE,
        EvalBackend.OPEN_COMPASS,
        EvalBackend.VLM_EVAL_KIT,
        EvalBackend.RAG_EVAL,
    ]
    eval_stages = [EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW]
    add('--eval-type', type=str, choices=eval_types,
        help='The type for evaluating.')
    add('--eval-backend', type=str, choices=eval_backends,
        help='The evaluation backend to use.')
    add('--eval-config', type=str, required=False,
        help='The eval task config file path for evaluation backend.')
    add('--stage', type=str, default='all', choices=eval_stages,
        help='The stage of evaluation pipeline.')
    add('--limit', type=float, default=None,
        help='Max evaluation samples num for each subset.')
    add('--eval-batch-size', type=int, default=1,
        help='The batch size for evaluation.')

    # Cache and working directory arguments
    add('--mem-cache', action='store_true', default=False,
        help='Deprecated, will be removed in v1.0.0.')
    add('--use-cache', type=str, help='Path to reuse the cached results.')
    add('--work-dir', type=str, help='The root cache dir.')

    # Debug and runtime mode arguments
    add('--ignore-errors', action='store_true', default=False,
        help='Ignore errors during evaluation.')
    add('--debug', action='store_true', default=False,
        help='Debug mode, will print information for debugging.')
    add('--dry-run', action='store_true', default=False,
        help='Dry run in single processing mode.')
    add('--seed', type=int, default=42,
        help='Random seed for reproducibility.')
    add('--api-key', type=str, default='EMPTY',
        help='The API key for the remote API model.')
    add('--api-url', type=str, default=None,
        help='The API url for the remote API model.')
    add('--timeout', type=float, default=None,
        help='The timeout for the remote API model.')
    add('--stream', action='store_true', default=False,
        help='Stream mode.')

    # LLMJudge arguments
    add('--judge-strategy', type=str, default=JudgeStrategy.AUTO,
        help='The judge strategy.')
    add('--judge-model-args', type=json.loads, default='{}',
        help='The judge model args, should be a json string.')
    add('--judge-worker-num', type=int, default=1,
        help='The number of workers for the judge model.')
    add('--analysis-report', action='store_true', default=False,
        help='Generate analysis report for the evaluation results using judge model.')
    # yapf: enable


def parse_args():
    """Build the evaluation CLI parser and parse the process command line.

    Returns:
        The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args()``.
    """
    cli_parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
    add_argument(cli_parser)
    return cli_parser.parse_args()