# Copyright (c) Alibaba, Inc. and its affiliates.
import time
from typing import List, Optional

from evalscope.models import CustomModel
from evalscope.utils.logger import get_logger

logger = get_logger()


class DummyCustomModel(CustomModel):
    """A minimal stand-in model that returns fixed placeholder responses."""

    def __init__(self, config: Optional[dict] = None, **kwargs):
        # Use None instead of a mutable `{}` default, then fall back to an
        # empty dict when initializing the base class.
        super().__init__(config=config or {}, **kwargs)
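
    # Note: `predict` below is the entry point EvalScope invokes for a custom
    # model; `make_request_messages` is a local helper (left unused by this
    # dummy, but handy when wiring in a real backend).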
    def make_request_messages(self, input_item: dict) -> list:
        """
        Build request messages in the OpenAI chat format.
        """
        if input_item.get('messages', None):
            return input_item['messages']

        data: list = input_item['data']
        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
            query = '\n'.join(''.join(item) for item in data)
        else:
            query = data[0]
        system_prompt = input_item.get('system_prompt', None)

        messages = []
        if system_prompt:
            messages.append({'role': 'system', 'content': system_prompt})

        messages.append({'role': 'user', 'content': query})

        return messages
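
    # For example (derived from the logic above):
    #   make_request_messages({'data': ['1 + 1 = ?']})
    #   -> [{'role': 'user', 'content': '1 + 1 = ?'}]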

    def predict(self, prompts: List[dict], **kwargs):
        original_inputs = kwargs.get('origin_inputs', None)
        infer_cfg = kwargs.get('infer_cfg', None)

        logger.debug(f'** Prompts: {prompts}')
        if original_inputs is not None:
            logger.debug(f'** Original inputs: {original_inputs}')
        if infer_cfg is not None:
            logger.debug(f'** Inference config: {infer_cfg}')
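
        # Assumption about the calling convention: `origin_inputs` appears to
        # carry the raw benchmark records while `prompts` are the formatted
        # inputs; this dummy iterates the raw records below.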

        # Simulate a response based on the prompts.
        # Must return a list of dicts in the same format as the OpenAI
        # Chat Completions API.
        responses = []
        for input_item in original_inputs:
            # message = self.make_request_messages(input_item)
            # response = f'Dummy response for prompt: {message}'
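            # (Hedged sketch) A real implementation would uncomment the two
            # lines above and run actual inference, e.g.:
            #   content = my_backend.chat(message, **(infer_cfg or {}))
            # where `my_backend` is a hypothetical client, not an EvalScope
            # API. The generated text then replaces '*PlaceHolder*' below.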

            res_d = {
                'choices': [{
                    'index': 0,
                    'message': {
                        'content': '*PlaceHolder*',
                        'role': 'assistant'
                    }
                }],
                'created': time.time(),
                'model': self.config.get('model_id'),
                'object': 'chat.completion',
                'usage': {
                    'completion_tokens': 0,
                    'prompt_tokens': 0,
                    'total_tokens': 0
                }
            }

            responses.append(res_d)

        return responses


if __name__ == '__main__':
    from evalscope import TaskConfig, run_task

    dummy_model = DummyCustomModel()
    task_config = TaskConfig(
        model=dummy_model,
        model_id='evalscope-model-dummy',
        datasets=['gsm8k'],
        eval_type='custom',  # must be 'custom' when evaluating a custom model
        generation_config={
            'max_new_tokens': 100,
            'temperature': 0.0,
            'top_p': 1.0,
            'top_k': 50,
            'repetition_penalty': 1.0
        },
        debug=True,
        limit=5,  # evaluate only the first 5 samples as a quick smoke test
    )

    eval_results = run_task(task_cfg=task_config)
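
    # Note: the exact structure of `eval_results` depends on the EvalScope
    # version; with the '*PlaceHolder*' responses above, the gsm8k score
    # should come out as 0.
    logger.info(f'Eval results: {eval_results}')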