# evalscope/perf/plugin/api/custom_api.py
import json
from typing import Any, Dict, List, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase
from evalscope.perf.plugin.registry import register_api
from evalscope.utils.logger import get_logger

logger = get_logger()


@register_api('custom')
class CustomPlugin(ApiPluginBase):
    """Supports the tensorrt-llm Triton server."""

    def __init__(self, model_path: str):
        """Initialize the plugin.

        Args:
            model_path (str): The model path; the tokenizer weights in the
                model directory are used to count input and output tokens.
        """
        super().__init__(model_path=model_path)
        if model_path is not None:
            # Only load a tokenizer when a model path is given; without one,
            # token counting is skipped in parse_responses.
            from modelscope import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        else:
            self.tokenizer = None

    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
        """Build the OpenAI-format request from the prompt/dataset messages.

        Args:
            messages (List[Dict]): The basic messages used to generate the query.
            param (Arguments): The query parameters.

        Returns:
            Dict: The request body, or None if the prompt format is invalid.
        """
        try:
            query = json.loads(param.query_template)
            # Fill the model name and the first message's content into the
            # parsed query template.
            ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
            return query
        except Exception as e:
            logger.exception(e)
            logger.error('Prompt: %s is invalid!' % messages)
            return None
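
    # Illustrative only: a query template for a tensorrt-llm Triton endpoint
    # might look like the JSON below. The "text_input" and "model_name" fields
    # mirror those seen in parse_responses; "max_tokens" and "stream" are
    # hypothetical additions, and the exact placeholder convention depends on
    # how ApiPluginBase.replace_values substitutes the model and prompt:
    #
    #   {"model_name": "<model>", "text_input": "<prompt>",
    #    "max_tokens": 128, "stream": true}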

    def parse_responses(self, responses, request: Any = None, **kwargs) -> Tuple[int, int]:
        """Parse the responses and count the request and response tokens.

        Sample output delta:
            {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}

        Args:
            responses (List[bytes]): List of HTTP response bodies; stream output
                produces multiple responses, non-stream output only one.
            request (Any): The request body that produced these responses.
            kwargs (Any): Extra content from command-line parameters.

        Returns:
            Tuple[int, int]: The number of prompt tokens and completion tokens.
        """
        full_response_content = ''
        delta_contents = {}
        input_tokens = None
        output_tokens = None
        for response in responses:
            data = json.loads(response)
            # Sample Triton response body:
            # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
            #  "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
            if 'text_output' in data:
                # Accumulate streamed text chunks under choice index 0.
                if 0 in delta_contents:
                    delta_contents[0].append(data['text_output'])
                else:
                    delta_contents[0] = [data['text_output']]
        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
            # No usage info in the responses: count tokens with the tokenizer.
            input_tokens = 0
            output_tokens = 0
            for _, choice_contents in delta_contents.items():
                full_response_content = ''.join(choice_contents)
                input_tokens += len(self.tokenizer.encode(request['text_input']))
                output_tokens += len(self.tokenizer.encode(full_response_content))
        elif input_tokens is None and output_tokens is None:  # no usage info and no tokenizer
            input_tokens = 0
            output_tokens = 0
            logger.warning('No usage info found.')
        return input_tokens, output_tokens
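

# Minimal usage sketch (illustrative, not part of the original module): feeds
# parse_responses a fake Triton-style stream. The response bodies and the
# 'text_input' value are hypothetical; with no model path the tokenizer is
# None, so the token counts fall back to (0, 0) with a warning.
if __name__ == '__main__':
    plugin = CustomPlugin(model_path=None)
    fake_responses = [
        json.dumps({'text_output': 'Hello'}),
        json.dumps({'text_output': ', world!'}),
    ]
    prompt_tokens, completion_tokens = plugin.parse_responses(
        fake_responses, request={'text_input': 'Say hello'})
    print(prompt_tokens, completion_tokens)  # -> 0 0 (no tokenizer loaded)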