import base64
import json
import os
import pickle
import re
import sqlite3
import sys
from datetime import datetime
from typing import Dict, List, Tuple

from tabulate import tabulate

from evalscope.perf.arguments import Arguments
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
from evalscope.utils.logger import get_logger

logger = get_logger()


def encode_data(data) -> str:
    """Encodes data using base64 and pickle."""
    return base64.b64encode(pickle.dumps(data)).decode('utf-8')
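
# Round-trip sketch (illustrative only): decoding reverses the pickle + base64 steps.
#   blob = encode_data({'prompt': 'hello'})
#   restored = pickle.loads(base64.b64decode(blob))
#   assert restored == {'prompt': 'hello'}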


def write_json_file(data, output_path):
    """Write `data` to `output_path` as indented JSON, keeping non-ASCII characters intact."""
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def transpose_results(data):
    """Convert a dict of column lists into a list of row dicts."""
    headers = data.keys()
    rows = zip(*data.values())

    return [dict(zip(headers, row)) for row in rows]
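
# For example (hypothetical values):
#   transpose_results({'Percentiles': ['50%', '90%'], 'Latency (s)': [0.5, 1.2]})
#   -> [{'Percentiles': '50%', 'Latency (s)': 0.5},
#       {'Percentiles': '90%', 'Latency (s)': 1.2}]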


def create_result_table(cursor):
    """Create the `result` table that stores one row per benchmark request."""
    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
                   request TEXT,
                   start_time REAL,
                   chunk_times TEXT,
                   success INTEGER,
                   response_messages TEXT,
                   completed_time REAL,
                   latency REAL,
                   first_chunk_latency REAL,
                   n_chunks INTEGER,
                   chunk_time REAL,
                   prompt_tokens INTEGER,
                   completion_tokens INTEGER,
                   max_gpu_memory_cost REAL)''')


def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
    """Insert one benchmark record; failed requests fill only the common columns."""
    request = encode_data(benchmark_data.request)
    chunk_times = json.dumps(benchmark_data.chunk_times)
    response_messages = encode_data(benchmark_data.response_messages)

    # Columns common to both success and failure cases
    common_columns = (
        request,
        benchmark_data.start_time,
        chunk_times,
        benchmark_data.success,
        response_messages,
        benchmark_data.completed_time,
    )

    if benchmark_data.success:
        # Additional latency and token-count columns recorded only for successful requests
        additional_columns = (
            benchmark_data.query_latency,
            benchmark_data.first_chunk_latency,
            benchmark_data.n_chunks,
            benchmark_data.n_chunks_time,
            benchmark_data.prompt_tokens,
            benchmark_data.completion_tokens,
            benchmark_data.max_gpu_memory_cost,
        )
        query = """INSERT INTO result(
            request, start_time, chunk_times, success, response_messages,
            completed_time, latency, first_chunk_latency,
            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
        cursor.execute(query, common_columns + additional_columns)
    else:
        query = """INSERT INTO result(
            request, start_time, chunk_times, success, response_messages, completed_time
        ) VALUES (?, ?, ?, ?, ?, ?)"""
        cursor.execute(query, common_columns)
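
# Usage sketch (illustrative; `bench_data` stands for a populated BenchmarkData):
#   con = sqlite3.connect(result_db_path)
#   cursor = con.cursor()
#   create_result_table(cursor)
#   insert_benchmark_data(cursor, bench_data)
#   con.commit()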


def get_output_path(args: Arguments) -> str:
    """Build and create a timestamped output directory for this run."""
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
    # Replace characters that are illegal in file paths
    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
    os.makedirs(output_path, exist_ok=True)
    logger.info(f'Save the result to: {output_path}')
    return output_path


def get_result_db_path(args: Arguments):
    """Return the result-database path, refusing to overwrite an existing file."""
    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

    logger.info(f'Save the database to: {result_db_path}')
    if os.path.exists(result_db_path):
        logger.warning('The db file already exists; delete it and start again.')
        sys.exit(1)

    return result_db_path


class PercentileMetrics:
    """Column labels for the percentile report."""
    TTFT = 'TTFT (s)'
    ITL = 'ITL (s)'
    TPOT = 'TPOT (s)'
    LATENCY = 'Latency (s)'
    INPUT_TOKENS = 'Input tokens'
    OUTPUT_TOKENS = 'Output tokens'
    OUTPUT_THROUGHPUT = 'Output (tok/s)'
    TOTAL_THROUGHPUT = 'Total (tok/s)'
    PERCENTILES = 'Percentiles'


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    """
    Calculate the requested percentiles for a list of metric values.

    :param data: List of values for a specific metric; sorted in place.
    :param percentiles: List of percentiles to calculate.
    :return: Dictionary mapping each percentile to its value.
    """
    results = {}
    n_success_queries = len(data)
    data.sort()
    for percentile in percentiles:
        try:
            idx = int(n_success_queries * percentile / 100)
            value = data[idx] if data[idx] is not None else float('nan')
            results[percentile] = round(value, 4)
        except IndexError:
            results[percentile] = float('nan')
    return results
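
# Illustrative call (hypothetical values): with ten sorted samples, the 50th
# percentile reads index int(10 * 50 / 100) == 5, i.e. the sixth value:
#   calculate_percentiles([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], [50, 90])
#   -> {50: 0.6, 90: 1.0}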


def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
    """
    Compute percentiles for various metrics from the database results.

    :param result_db_path: Path to the SQLite database file.
    :return: Dictionary of percentile values for each metric.
    """

    def inter_token_latencies(chunk_times_json: str) -> List[float]:
        # Differences between consecutive chunk arrival times for one request
        try:
            chunk_times = json.loads(chunk_times_json)
            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
        except (json.JSONDecodeError, TypeError) as e:
            logger.error(f'Error parsing chunk times: {e}')
            return []

    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
                 'FROM result WHERE success=1')

    percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

    with sqlite3.connect(result_db_path) as con:
        rows = con.execute(query_sql).fetchall()

    # Column indices into each row returned by the query above
    CHUNK_TIMES_INDEX = 1
    LATENCY_INDEX = 4
    FIRST_CHUNK_LATENCY_INDEX = 5
    CHUNK_TIME_INDEX = 7
    PROMPT_TOKENS_INDEX = 8
    COMPLETION_TOKENS_INDEX = 9

    # Flatten per-request inter-token latencies into a single list
    inter_token_latencies_all = []
    for row in rows:
        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))

    metrics = {
        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
        PercentileMetrics.ITL: inter_token_latencies_all,
        PercentileMetrics.TPOT: [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX])
                                 if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan') for row in rows],
        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
        PercentileMetrics.OUTPUT_THROUGHPUT: [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX])
                                              if row[LATENCY_INDEX] > 0 else float('nan') for row in rows],
        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
                                             for row in rows],
    }

    # Calculate percentiles for each metric
    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
    for metric_name, data in metrics.items():
        metric_percentiles = calculate_percentiles(data, percentiles)
        results[metric_name] = [metric_percentiles[p] for p in percentiles]

    return results
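
# The returned mapping is column-oriented, e.g. (hypothetical values):
#   {'Percentiles': ['10%', ..., '99%'], 'TTFT (s)': [0.12, ...], ...}
# transpose_results() converts it to row dicts before it is written to JSON.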


def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
    """Write the summary and percentile reports to disk and log them as tables."""
    result_path = os.path.dirname(result_db_path)
    write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

    metrics_result = metrics.create_message()
    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

    # Print the summary as a table
    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
    logger.info('\nBenchmarking summary:\n' + table)

    # Get percentile results
    percentile_result = get_percentile_results(result_db_path)
    if percentile_result:
        write_json_file(transpose_results(percentile_result), os.path.join(result_path, 'benchmark_percentile.json'))
        # Print percentile results as a table
        table = tabulate(percentile_result, headers='keys', tablefmt='pretty')
        logger.info('\nPercentile results:\n' + table)

    if args.dataset.startswith('speed_benchmark'):
        speed_benchmark_result(result_db_path)

    logger.info(f'Save the summary to: {result_path}')

    return metrics_result, percentile_result
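
# summary_result() leaves benchmark_args.json, benchmark_summary.json and
# benchmark_percentile.json next to the database, plus speed_benchmark.json
# for speed-benchmark datasets (see speed_benchmark_result below).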


def speed_benchmark_result(result_db_path: str):
    """Report average generation speed and GPU memory cost grouped by prompt length."""
    query_sql = """
        SELECT
            prompt_tokens,
            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
            ROUND(AVG(max_gpu_memory_cost), 2) AS avg_max_gpu_memory_cost
        FROM
            result
        WHERE
            success = 1 AND latency > 0
        GROUP BY
            prompt_tokens
    """

    with sqlite3.connect(result_db_path) as con:
        cursor = con.cursor()
        cursor.execute(query_sql)
        rows = cursor.fetchall()

    # Prepare data for tabulation
    headers = ['Prompt Tokens', 'Speed(tokens/s)', 'GPU Memory(GB)']
    data = [dict(zip(headers, row)) for row in rows]

    # Print results as a table
    table = tabulate(data, headers='keys', tablefmt='pretty')
    logger.info('\nSpeed Benchmark Results:\n' + table)

    # Write results to a JSON file
    result_path = os.path.dirname(result_db_path)
    write_json_file(data, os.path.join(result_path, 'speed_benchmark.json'))