# Source: evalscope_v0.17.0/evalscope.0.17.0/evalscope/backend/opencompass/backend_manager.py
# (259 lines, 10 KiB, Python)
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import subprocess
import tempfile
from dataclasses import asdict
from enum import Enum
from typing import Optional, Union
from evalscope.backend.base import BackendManager
from evalscope.backend.opencompass.api_meta_template import get_template
from evalscope.utils.import_utils import get_module_path, is_module_installed
from evalscope.utils.io_utils import get_valid_list
from evalscope.utils.logger import get_logger
logger = get_logger()
class CmdMode(Enum):
    """How the OpenCompass command line is assembled."""

    # Run the command directly,
    # e.g. `python -m run --models model1 model2 --datasets dataset1 dataset2`
    BASIC = 'basic'

    # Run the command with a config script,
    # e.g. `python -m run your_config_script.py`
    SCRIPT = 'script'
class RunMode(Enum):
    """How an OpenCompass task is launched."""

    # Launch by building a command line and running it in a subprocess.
    CMD = 'cmd'

    # Launch through a direct function call -- run_task().
    FUNCTION = 'function'
class OpenCompassBackendManager(BackendManager):
    """Backend manager that drives OpenCompass evaluation tasks for evalscope.

    Tasks can be submitted either through a direct function call
    (``RunMode.FUNCTION``, the default) or by shelling out to the
    OpenCompass CLI (``RunMode.CMD``).
    """

    def __init__(self, config: Union[str, dict], **kwargs):
        """
        The backend manager for OpenCompass.

        Args:
            config: Union[str, dict], the configuration yaml-file or the configuration dictionary.
                attributes:
                    datasets: list, the datasets.
                    models: list, the models.
                    work_dir (Optional): str, the working directory. Default to None, which means the current directory.
                    dry_run (Optional): bool, the dry-run flag. Default to False.
                    debug (Optional): bool, the debug flag. Default to False.
                    reuse (Optional): str, reuse previous outputs & results. Default to None.
                    generation_kwargs (Optional): dict, the generation config. Default to {}.
                    limit (Optional): int or float or str, the limit of the number of examples. Default to None.
                        if limit is a string, it should be in the format of '[start:end]'.
                example:
                    config = dict(
                        datasets=[mmlu, ceval],
                        models=[...],
                    )
            **kwargs: the keyword arguments.
        """
        self._check_env()
        super().__init__(config, **kwargs)

        # Deferred import: only reachable after _check_env() confirmed opencompass is importable.
        from opencompass.cli.arguments import Arguments as OpenCompassArguments
        self.args = OpenCompassArguments(**self.config_d)

    @property
    def cmd(self) -> str:
        """The command line string used by RunMode.CMD (built on every access)."""
        return self.get_cmd()

    @staticmethod
    def _check_env():
        """Check that the `opencompass` package is installed.

        Raises:
            RuntimeError: if opencompass is not importable. (Previously this
                check passed silently when the package was missing and the
                failure surfaced later as a less clear ImportError.)
        """
        if is_module_installed('opencompass'):
            logger.info('Check the OpenCompass environment: OK')
        else:
            raise RuntimeError('The opencompass package is not installed, '
                               'please install it before using the OpenCompass backend.')

    @staticmethod
    def get_restore_arg(arg_name: str, arg_val: bool) -> str:
        """Render a boolean CLI flag: '--<arg_name>' when set, '' otherwise."""
        if arg_val:
            return f'--{arg_name}'
        else:
            return ''

    @staticmethod
    def get_arg_with_default(arg_name: str, arg_val: Optional[str] = None) -> str:
        """Render a valued CLI option: '--<arg_name> <arg_val>' when a value is given, '' otherwise."""
        if arg_val:
            return f'--{arg_name} {arg_val}'
        else:
            return ''

    def load_task_template(self):
        """
        Load the initial OpenCompass task template from task config file.

        Returns:
            (mmengine.config.config.Config), the initial task template config.
        """
        from opencompass.utils.run import get_config_from_arg
        template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
        self.args.config = template_config_path
        return get_config_from_arg(self.args)

    @staticmethod
    def list_datasets(return_details: bool = False):
        """List the dataset names supported by the bundled task template.

        Args:
            return_details: if True, also return the raw dataset configs.

        Returns:
            The unique dataset show-names, e.g. ['mmlu', 'ceval', ...];
            with ``return_details=True``, a tuple (names, dataset configs).
        """
        from dataclasses import dataclass

        from opencompass.utils.run import get_config_from_arg

        # Minimal stand-in for the full Arguments object: get_config_from_arg
        # only needs `config` here (plus an `accelerator` attribute).
        @dataclass
        class TempArgs:
            config: str
            accelerator: str = None

        template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
        template_cfg = get_config_from_arg(TempArgs(config=template_config_path))

        # e.g. ['mmlu', 'ceval', 'openai_humaneval', ...]
        dataset_show_names = list(set([_dataset['dataset_name'] for _dataset in template_cfg.datasets]))

        if return_details:
            return dataset_show_names, template_cfg.datasets
        else:
            return dataset_show_names

    def get_task_args(self):
        """Return the parsed OpenCompass Arguments object."""
        return self.args

    def get_cmd(self, cmd_mode: CmdMode = CmdMode.BASIC) -> str:
        """Build the OpenCompass CLI command string.

        Args:
            cmd_mode: CmdMode.BASIC builds the full argument list from
                self.args; CmdMode.SCRIPT runs an existing config script.

        Raises:
            ValueError: for an unsupported cmd_mode.
        """
        if cmd_mode == CmdMode.BASIC:
            assert self.args.datasets, 'The datasets are required.'
            assert self.args.models, 'The models are required.'
            cmd_str = f'python -m run_oc ' \
                      f'--models {" ".join(self.args.models)} ' \
                      f'--datasets {" ".join(self.args.datasets)} ' \
                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                      f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
        elif cmd_mode == CmdMode.SCRIPT:
            assert self.args.config, 'The script file is required.'
            cmd_str = f'python -m run_oc {self.args.config}'
        else:
            raise ValueError(f'Unsupported command mode: {cmd_mode}')

        return cmd_str

    def run(self, run_mode: RunMode = RunMode.FUNCTION):
        """
        The entry function to run the OpenCompass task.

        Args:
            run_mode: the running mode, RunMode.FUNCTION or RunMode.CMD.

        Returns:
            None

        Raises:
            ValueError: for an unsupported run_mode.
        """
        if run_mode == RunMode.FUNCTION:
            from opencompass.cli.arguments import ApiModelConfig
            from opencompass.cli.main import run_task

            assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'
            tmp_model_d: dict = self.args.models[0]
            assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
                f'Got invalid model config: {tmp_model_d}. \nTo get valid format: ' \
                "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"

            # Get valid datasets
            dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
            dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)

            if not dataset_names:
                logger.warning(f'No datasets are specified in the config. Use all the datasets: {dataset_names_all}')
                valid_dataset_names = dataset_names_all
            else:
                valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
                if len(invalid_dataset_names) > 0:
                    logger.error(f'Invalid datasets: {invalid_dataset_names}, '
                                 f'refer to the following list to get proper dataset name: {dataset_names_all}')
                assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                    f'To get the valid datasets, please refer to {dataset_names_all}'

            valid_datasets = [
                _dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names
            ]
            for _dataset in valid_datasets:
                # 'dataset_name' is an evalscope-side marker, not an OpenCompass field.
                _dataset.pop('dataset_name')
                _dataset['reader_cfg']['test_range'] = self.args.limit

            # Get valid models
            models = []
            for model_d in self.args.models:
                # model_d: {'path': 'qwen-7b-chat',
                #           'meta_template': 'default-api-meta-template-oc',  # Optional
                #           'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}
                # Note: 'meta_template' can be a dict or a string, default is None
                if 'meta_template' in model_d and isinstance(model_d['meta_template'], str):
                    model_d['meta_template'] = get_template(model_d['meta_template'])

                # set the 'abbr' as the 'path' if 'abbr' is not specified
                model_d['abbr'] = os.path.basename(model_d['path'])
                model_config = ApiModelConfig(**model_d)
                models.append(asdict(model_config))

            # Load the initial task template and override configs
            template_cfg = self.load_task_template()
            template_cfg.datasets = valid_datasets
            template_cfg.models = models

            # Dump task config to a temporary file. Only the file name is used
            # (Config.dump opens it itself), so close the handle right away to
            # avoid leaking the file descriptor.
            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.py', mode='w')
            tmp_file.close()
            template_cfg.dump(tmp_file.name)
            self.args.config = tmp_file.name

            # Submit the task
            logger.info(f'*** Run task with config: {self.args.config} \n')
            run_task(self.args)

        # TODO: add more arguments for the command line
        elif run_mode == RunMode.CMD:
            # NOTE(review): shell=True on a string built from config values;
            # fine for trusted local configs, unsafe for untrusted input.
            subprocess.run(self.cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        else:
            raise ValueError(f'Unsupported run mode: {run_mode}')
if __name__ == '__main__':
    # Demo entry point. The supported dataset names (e.g. 'mmlu', 'ceval',
    # 'ARC_c', 'gsm8k', 'humaneval', ...) can be listed with
    # OpenCompassBackendManager.list_datasets().
    # A model entry may optionally carry
    # 'meta_template': 'default-api-meta-template-oc'.
    demo_config = {
        'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
        'models': [{
            'path': 'llama3-8b-instruct',
            'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
        }],
        'limit': 5
    }
    manager = OpenCompassBackendManager(config=demo_config)

    available_datasets = OpenCompassBackendManager.list_datasets()
    print(f'all_datasets: {available_datasets}')

    manager.run()