# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import time
from typing import List

from evalscope.models.custom import CustomModel
from evalscope.run import run_task
from evalscope.summarizer import Summarizer
from evalscope.utils.logger import get_logger

logger = get_logger()


class SwiftModel(CustomModel):
    """A custom model wrapper that returns OpenAI-style chat.completion dicts."""

    def __init__(self, config: dict, **kwargs):
        super().__init__(config=config, **kwargs)

    def predict(self, prompts: List[str], **kwargs):
        # Real inference would look something like:
        #   query = 'Where is the capital of Zhejiang Province?'
        #   prompts = [query]
        #   response, history = self.inference(self.model, self.template, prompts)
        #   response: str = str(response)

        # ONLY FOR TEST: return a fixed answer for every prompt
        response = 'The answer is C.'

        res_d: dict = {
            'choices': [{
                'index': 0,
                'message': {
                    # Alternatively, echo the prompt back for debugging:
                    # 'content': f'The answer is B. Raw prompt: {prompts[0]}',
                    'content': response,
                    'role': 'assistant'
                }
            }],
            'created': time.time(),
            'model': self.config.get('model_id'),
            'object': 'chat.completion',
            'usage': {
                'completion_tokens': 0,
                'prompt_tokens': 0,
                'total_tokens': 0
            }
        }
        # Return one response dict per input prompt
        return [res_d for _ in prompts]


if __name__ == '__main__':
    from evalscope.config import TaskConfig

    swift_model = SwiftModel(config={'model_id': 'swift_grok-base-dummy'})

    # List the registered task templates,
    # e.g. ['arc', 'gsm8k', 'bbh_mini', 'mmlu_mini', 'ceval_mini']
    print(TaskConfig.list())

    # Customize your own dataset; to get the reference datasets:
    #   wget https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/benchmark/data.zip
    #   unzip data.zip
    custom_dataset_name = 'general_qa_swift_custom_dataset'
    custom_dataset_pattern = 'general_qa'  # Available options: ['arc', 'gsm8k', 'mmlu', 'ceval', 'bbh']
    TaskConfig.registry(
        name=custom_dataset_name,
        data_pattern=custom_dataset_pattern,
        dataset_dir='/path/to/general_qa_swift',
        # subset_list=['my_swift_custom_subset1', 'my_swift_custom_subset2'],
    )

    # Load the task config list
    task_config_list = TaskConfig.load(custom_model=swift_model, tasks=[custom_dataset_name, 'arc'])

    # You can update each task config with your own settings
    for config_item in task_config_list:
        config_item.limit = 20  # Limit the number of samples per subset to evaluate; default is None (all)
        config_item.use_cache = False
    print(task_config_list)

    eval_results: dict = run_task(task_cfg=task_config_list)
    print('** Evaluation finished!\n')

    # Get the final report for your evaluation task
    final_report: List[dict] = Summarizer.get_report_from_cfg(task_cfg=task_config_list)
    print(f'*** Final report ***\n{json.dumps(final_report, ensure_ascii=False)}\n')
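

# ---------------------------------------------------------------------------
# A minimal sketch of what a real generation call inside predict() could look
# like. This is NOT part of the original example and NOT evalscope's API: it
# assumes the Hugging Face `transformers` package and an illustrative
# checkpoint name. The commented-out lines in predict() instead target
# ms-swift's own inference helper, whose exact signature depends on the
# installed swift version. Adapt accordingly.
# ---------------------------------------------------------------------------
def _sketch_generate(prompt: str, model_name: str = 'Qwen/Qwen2-0.5B-Instruct') -> str:
    """Generate a completion for a single prompt (illustrative only)."""
    # Lazy import so the dummy example above stays runnable without transformers
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    inputs = tokenizer(prompt, return_tensors='pt')
    output_ids = model.generate(**inputs, max_new_tokens=64)
    # Strip the prompt tokens and decode only the newly generated ones
    new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
    # In predict(), you would then build res_d from
    # `_sketch_generate(p)` for each p in prompts instead of the fixed string.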