import json
from typing import Any, Dict, Iterator, List, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase
from evalscope.perf.plugin.registry import register_api
from evalscope.utils.logger import get_logger

logger = get_logger()


@register_api('custom')
class CustomPlugin(ApiPluginBase):
    """Support the TensorRT-LLM Triton inference server."""

    def __init__(self, mode_path: str):
        """Init the plugin.

        Args:
            mode_path (str): The model path. The tokenizer weights in the
                model directory are used to count the input and output tokens.
        """
        super().__init__(model_path=mode_path)
        if mode_path is not None:
            from modelscope import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
        else:
            self.tokenizer = None

    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
        """Build the OpenAI-format request from the prompt and dataset.

        Args:
            messages (List[Dict]): The basic messages used to generate the query.
            param (Arguments): The query parameters.

        Raises:
            Exception: NotImplemented

        Returns:
            Dict: The request body, or None if the prompt format is invalid.
        """
        try:
            query = json.loads(param.query_template)
            ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
            return query
        except Exception as e:
            logger.exception(e)
            logger.error('Prompt: %s is invalid!' % messages)
            return None
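
    # Illustrative sketch (not part of the original plugin): one possible shape
    # for the --query-template JSON that build_request() parses. The '%p'
    # placeholder convention assumed below is a guess at what
    # ApiPluginBase.replace_values substitutes (prompt text); check the
    # evalscope source if it differs. parse_responses() later reads the prompt
    # back from the request's 'text_input' field, matching the tensorrt-llm
    # Triton generate endpoint.
    #
    #   EXAMPLE_QUERY_TEMPLATE = json.dumps({
    #       'text_input': '%p',   # assumed placeholder for messages[0]['content']
    #       'max_tokens': 256,
    #       'stream': True,
    #   })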

    def parse_responses(self, responses, request: Any = None, **kwargs) -> Tuple[int, int]:
        """Parse responses and return the number of request and response tokens.

        Sample of an output delta:
        {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}

        Args:
            responses (List[bytes]): List of HTTP response bodies; for stream
                output there are multiple responses, otherwise only one.
            kwargs (Any): The content of the command line --parameter option.

        Returns:
            Tuple: The number of prompt tokens and the number of completion tokens.
        """
        full_response_content = ''
        delta_contents = {}
        input_tokens = None
        output_tokens = None
        for response in responses:
            data = json.loads(response)
            # Sample Triton response chunk:
            # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
            #  "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
            if 'text_output' in data:
                if 0 in delta_contents:
                    delta_contents[0].append(data['text_output'])
                else:
                    delta_contents[0] = [data['text_output']]
        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
            input_tokens = 0
            output_tokens = 0
            for _, choice_contents in delta_contents.items():
                full_response_content = ''.join(choice_contents)
                input_tokens += len(self.tokenizer.encode(request['text_input']))
                output_tokens += len(self.tokenizer.encode(full_response_content))
        elif input_tokens is None and output_tokens is None:  # no usage info available
            input_tokens = 0
            output_tokens = 0
            logger.warning('No usage info found.')

        return input_tokens, output_tokens
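

# A minimal local sketch (not part of the original plugin) showing how the two
# methods above could be exercised by hand. It assumes evalscope is installed;
# the fake Triton-style payloads and the missing model path below are
# illustrative placeholders only.
if __name__ == '__main__':
    # Without a model path there is no tokenizer, so token counts fall back to 0.
    plugin = CustomPlugin(mode_path=None)

    # Simulate two streamed chunks from a tensorrt-llm Triton generate endpoint.
    fake_responses = [
        json.dumps({'text_output': 'Hello'}).encode('utf-8'),
        json.dumps({'text_output': ' world'}).encode('utf-8'),
    ]
    fake_request = {'text_input': 'Say hello'}

    prompt_tokens, completion_tokens = plugin.parse_responses(fake_responses, request=fake_request)
    print(prompt_tokens, completion_tokens)  # 0 0 here; real counts when a model path is given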