# evalscope/perf/plugin/api/openai_api.py


import json
import os
from typing import Any, Dict, List, Union
from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase
from evalscope.perf.plugin.registry import register_api
from evalscope.utils.logger import get_logger

logger = get_logger()


@register_api(['openai', 'local_vllm', 'local'])
class OpenaiPlugin(ApiPluginBase):
    """API plugin for OpenAI-compatible endpoints."""

    def __init__(self, model_path: str):
        """Initialize the plugin.

        Args:
            model_path (str): The model path. The tokenizer loaded from this
                path is used to count input and output tokens when the API
                response carries no usage information.
        """
        super().__init__(model_path=model_path)
        if model_path is not None:
            from modelscope import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        else:
            self.tokenizer = None
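
    # Illustrative note: `model_path` may be a local directory or a ModelScope
    # model id; the tokenizer is only consulted when a response carries no
    # 'usage' field (see parse_responses below).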

    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
        """Build an OpenAI-format request body from the given messages.

        Args:
            messages (List[Dict] | str): The messages (chat) or prompt
                (completion) used to generate the query.
            param (Arguments): The query parameters.

        Returns:
            Dict: The request body, or None if the prompt format is invalid.
        """
        try:
            if param.query_template is not None:
                if param.query_template.startswith('@'):
                    # A leading '@' means the template is the path of a JSON file.
                    file_path = param.query_template[1:]
                    if os.path.exists(file_path):
                        with open(file_path, 'r') as file:
                            query = json.load(file)
                    else:
                        raise FileNotFoundError(f'Query template file not found: {file_path}')
                else:
                    query = json.loads(param.query_template)
                if 'stream' in query:
                    param.stream = query['stream']
                # Replace the template messages with the input messages.
                query['messages'] = messages
            elif isinstance(messages, str):
                query = {'prompt': messages}
            else:
                query = {'messages': messages}
            return self.__compose_query_from_parameter(query, param)
        except Exception as e:
            logger.exception(e)
            return None
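
    # Illustrative example (not executed): given
    #   query_template = '{"stream": true, "temperature": 0.7, "messages": []}'
    # and messages = [{'role': 'user', 'content': 'hello'}], build_request
    # returns roughly:
    #   {'messages': [{'role': 'user', 'content': 'hello'}], 'model': <model>,
    #    'temperature': 0.7, 'stream': True,
    #    'stream_options': {'include_usage': True}}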

    def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
        """Overlay every non-None sampling parameter from ``param`` onto the payload."""
        payload['model'] = param.model
        if param.max_tokens is not None:
            payload['max_tokens'] = param.max_tokens
        if param.min_tokens is not None:
            payload['min_tokens'] = param.min_tokens
        if param.frequency_penalty is not None:
            payload['frequency_penalty'] = param.frequency_penalty
        if param.repetition_penalty is not None:
            payload['repetition_penalty'] = param.repetition_penalty
        if param.logprobs is not None:
            payload['logprobs'] = param.logprobs
        if param.n_choices is not None:
            payload['n'] = param.n_choices
        if param.seed is not None:
            payload['seed'] = param.seed
        if param.stop is not None:
            payload['stop'] = param.stop
        if param.stream:
            payload['stream'] = param.stream
            # Ask the server to append a final chunk that carries token usage.
            payload['stream_options'] = {'include_usage': True}
        if param.stop_token_ids is not None:
            payload['stop_token_ids'] = param.stop_token_ids
        if param.temperature is not None:
            payload['temperature'] = param.temperature
        if param.top_p is not None:
            payload['top_p'] = param.top_p
        if param.top_k is not None:
            payload['top_k'] = param.top_k
        if param.extra_args is not None:
            payload.update(param.extra_args)
        return payload
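
    # Illustrative note: param.extra_args is merged last, so backend-specific
    # fields (e.g. {'top_logprobs': 5}) pass through verbatim and may override
    # any field set above.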

    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
        """Parse responses and return the number of request and response tokens.

        There is a single response for non-stream requests and multiple
        responses (one per chunk) for stream requests.
        """
        # For stream requests the last chunk carries the full usage;
        # for non-stream requests the last response is the only response.
        last_response_js = json.loads(responses[-1])
        if 'usage' in last_response_js and last_response_js['usage']:
            input_tokens = last_response_js['usage']['prompt_tokens']
            output_tokens = last_response_js['usage']['completion_tokens']
            return input_tokens, output_tokens

        # No usage information in the response: reconstruct the generated text
        # per choice and count tokens with the local tokenizer.
        delta_contents = {}
        for response in responses:
            js = json.loads(response)
            if 'object' in js:
                self.__process_response_object(js, delta_contents)
            else:
                self.__process_no_object(js, delta_contents)
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
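
    # Illustrative example (not executed): with stream=True and
    # stream_options={'include_usage': True}, the final chunk typically looks
    # like '{"object": "chat.completion.chunk", "choices": [],
    #        "usage": {"prompt_tokens": 5, "completion_tokens": 42}}',
    # so the usage branch above returns (5, 42) without touching the tokenizer.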

    def __process_response_object(self, js, delta_contents):
        """Collect per-choice content, keyed by choice index, based on ``object``."""
        if js['object'] == 'chat.completion':
            # Non-stream chat response: one full message per choice.
            for choice in js['choices']:
                delta_contents[choice['index']] = [choice['message']['content']]
        elif js['object'] == 'text_completion':
            # Non-stream completion response: one full text per choice.
            for choice in js['choices']:
                delta_contents[choice['index']] = [choice['text']]
        elif js['object'] == 'chat.completion.chunk':
            # Stream chunk: append each content delta to its choice.
            for choice in js.get('choices', []):
                if 'delta' in choice and 'index' in choice:
                    delta = choice['delta']
                    idx = choice['index']
                    if 'content' in delta:
                        delta_content = delta['content']
                        delta_contents.setdefault(idx, []).append(delta_content)

    def __process_no_object(self, js, delta_contents):
        """Handle responses that omit the ``object`` field."""
        for choice in js['choices']:
            if 'delta' in choice:
                # Stream-style chunk: append the content delta.
                delta = choice['delta']
                idx = choice['index']
                if 'content' in delta:
                    delta_content = delta['content']
                    delta_contents.setdefault(idx, []).append(delta_content)
            else:
                # Non-stream response: take the full message content.
                delta_contents[choice['index']] = [choice['message']['content']]

    def __calculate_tokens_from_content(self, request, delta_contents):
        input_tokens = output_tokens = 0
        if self.tokenizer is not None:
            for idx, choice_contents in delta_contents.items():
                full_response_content = ''.join(choice_contents)
                # Note: this assumes a chat-format request and counts the
                # first message of the prompt once per choice.
                input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                output_tokens += len(self.tokenizer.encode(full_response_content))
        else:
            raise ValueError(
                'Error: Unable to retrieve usage information\n\n'
                'This error occurs when:\n'
                '1. The API response does not contain usage data, AND\n'
                '2. No tokenizer has been specified or found.\n\n'
                'To resolve this issue, do ONE of the following:\n'
                "a) Ensure that the API you're using supports and returns usage information, OR\n"
                'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
                'If you continue to experience issues, please open an issue on our '
                'GitHub repository: https://github.com/modelscope/evalscope')
        return input_tokens, output_tokens
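

# A minimal usage sketch (illustrative only; the Arguments fields shown are
# assumptions inferred from the parameters referenced above, not a documented
# recipe):
#
#   from evalscope.perf.arguments import Arguments
#
#   args = Arguments(model='my-model', api='openai', stream=True)
#   plugin = OpenaiPlugin(None)  # no tokenizer; rely on the 'usage' field
#   body = plugin.build_request([{'role': 'user', 'content': 'hello'}], args)
#   # body -> {'messages': [...], 'model': 'my-model', 'stream': True,
#   #          'stream_options': {'include_usage': True}}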