import base64
import json
import os
import pickle
import re
import sqlite3
import sys
from datetime import datetime
from typing import Dict, List, Tuple

from tabulate import tabulate

from evalscope.perf.arguments import Arguments
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
from evalscope.utils.logger import get_logger

logger = get_logger()


def encode_data(data) -> str:
    """Encodes data using base64 and pickle."""
    return base64.b64encode(pickle.dumps(data)).decode('utf-8')
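
# Round-trip sketch (illustrative only): decoding reverses the pickle + base64 steps.
#   blob = encode_data({'prompt': 'hello'})
#   restored = pickle.loads(base64.b64decode(blob))
#   assert restored == {'prompt': 'hello'}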


def write_json_file(data, output_path):
    """Write `data` to `output_path` as indented JSON, keeping non-ASCII characters intact."""
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def transpose_results(data):
    """Convert a dict of column lists into a list of row dicts."""
    headers = data.keys()
    rows = zip(*data.values())

    return [dict(zip(headers, row)) for row in rows]
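
# For example (hypothetical values):
#   transpose_results({'Percentiles': ['50%', '90%'], 'Latency (s)': [0.5, 1.2]})
#   -> [{'Percentiles': '50%', 'Latency (s)': 0.5},
#       {'Percentiles': '90%', 'Latency (s)': 1.2}]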


def create_result_table(cursor):
    """Create the `result` table that stores one row per benchmark request."""
    cursor.execute('''CREATE TABLE IF NOT EXISTS result(
                   request TEXT,
                   start_time REAL,
                   chunk_times TEXT,
                   success INTEGER,
                   response_messages TEXT,
                   completed_time REAL,
                   latency REAL,
                   first_chunk_latency REAL,
                   n_chunks INTEGER,
                   chunk_time REAL,
                   prompt_tokens INTEGER,
                   completion_tokens INTEGER,
                   max_gpu_memory_cost REAL)''')


def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
    """Insert one benchmark record; failed requests fill only the common columns."""
    request = encode_data(benchmark_data.request)
    chunk_times = json.dumps(benchmark_data.chunk_times)
    response_messages = encode_data(benchmark_data.response_messages)

    # Columns common to both success and failure cases
    common_columns = (
        request,
        benchmark_data.start_time,
        chunk_times,
        benchmark_data.success,
        response_messages,
        benchmark_data.completed_time,
    )

    if benchmark_data.success:
        # Additional latency and token-count columns recorded only for successful requests
        additional_columns = (
            benchmark_data.query_latency,
            benchmark_data.first_chunk_latency,
            benchmark_data.n_chunks,
            benchmark_data.n_chunks_time,
            benchmark_data.prompt_tokens,
            benchmark_data.completion_tokens,
            benchmark_data.max_gpu_memory_cost,
        )
        query = """INSERT INTO result(
            request, start_time, chunk_times, success, response_messages,
            completed_time, latency, first_chunk_latency,
            n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
        cursor.execute(query, common_columns + additional_columns)
    else:
        query = """INSERT INTO result(
            request, start_time, chunk_times, success, response_messages, completed_time
        ) VALUES (?, ?, ?, ?, ?, ?)"""
        cursor.execute(query, common_columns)
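
# Usage sketch (illustrative; `bench_data` stands for a populated BenchmarkData):
#   con = sqlite3.connect(result_db_path)
#   cursor = con.cursor()
#   create_result_table(cursor)
#   insert_benchmark_data(cursor, bench_data)
#   con.commit()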


def get_output_path(args: Arguments) -> str:
    """Build and create a timestamped output directory for this run."""
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
    # Replace characters that are illegal in file paths
    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
    os.makedirs(output_path, exist_ok=True)
    logger.info(f'Save the result to: {output_path}')
    return output_path


def get_result_db_path(args: Arguments):
    """Return the result-database path, refusing to overwrite an existing file."""
    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

    logger.info(f'Save the database to: {result_db_path}')
    if os.path.exists(result_db_path):
        logger.warning('The db file already exists; delete it and start again.')
        sys.exit(1)

    return result_db_path


class PercentileMetrics:
    """Column labels for the percentile report."""
    TTFT = 'TTFT (s)'
    ITL = 'ITL (s)'
    TPOT = 'TPOT (s)'
    LATENCY = 'Latency (s)'
    INPUT_TOKENS = 'Input tokens'
    OUTPUT_TOKENS = 'Output tokens'
    OUTPUT_THROUGHPUT = 'Output (tok/s)'
    TOTAL_THROUGHPUT = 'Total (tok/s)'
    PERCENTILES = 'Percentiles'


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    """
    Calculate the requested percentiles for a list of metric values.

    :param data: List of values for a specific metric; sorted in place.
    :param percentiles: List of percentiles to calculate.
    :return: Dictionary mapping each percentile to its value.
    """
    results = {}
    n_success_queries = len(data)
    data.sort()
    for percentile in percentiles:
        try:
            idx = int(n_success_queries * percentile / 100)
            value = data[idx] if data[idx] is not None else float('nan')
            results[percentile] = round(value, 4)
        except IndexError:
            results[percentile] = float('nan')
    return results
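
# Illustrative call (hypothetical values): with ten sorted samples, the 50th
# percentile reads index int(10 * 50 / 100) == 5, i.e. the sixth value:
#   calculate_percentiles([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], [50, 90])
#   -> {50: 0.6, 90: 1.0}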


def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
    """
    Compute percentiles for various metrics from the database results.

    :param result_db_path: Path to the SQLite database file.
    :return: Dictionary of percentile values for each metric.
    """

    def inter_token_latencies(chunk_times_json: str) -> List[float]:
        # Differences between consecutive chunk arrival times for one request
        try:
            chunk_times = json.loads(chunk_times_json)
            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
        except (json.JSONDecodeError, TypeError) as e:
            logger.error(f'Error parsing chunk times: {e}')
            return []

    query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                 'n_chunks, chunk_time, prompt_tokens, completion_tokens '
                 'FROM result WHERE success=1')

    percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

    with sqlite3.connect(result_db_path) as con:
        rows = con.execute(query_sql).fetchall()

    # Column indices into each row returned by the query above
    CHUNK_TIMES_INDEX = 1
    LATENCY_INDEX = 4
    FIRST_CHUNK_LATENCY_INDEX = 5
    CHUNK_TIME_INDEX = 7
    PROMPT_TOKENS_INDEX = 8
    COMPLETION_TOKENS_INDEX = 9

    # Flatten per-request inter-token latencies into a single list
    inter_token_latencies_all = []
    for row in rows:
        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))

    metrics = {
        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
        PercentileMetrics.ITL: inter_token_latencies_all,
        PercentileMetrics.TPOT: [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX])
                                 if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan') for row in rows],
        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
        PercentileMetrics.OUTPUT_THROUGHPUT: [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX])
                                              if row[LATENCY_INDEX] > 0 else float('nan') for row in rows],
        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
                                             for row in rows],
    }

    # Calculate percentiles for each metric
    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
    for metric_name, data in metrics.items():
        metric_percentiles = calculate_percentiles(data, percentiles)
        results[metric_name] = [metric_percentiles[p] for p in percentiles]

    return results
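
# The returned mapping is column-oriented, e.g. (hypothetical values):
#   {'Percentiles': ['10%', ..., '99%'], 'TTFT (s)': [0.12, ...], ...}
# transpose_results() converts it to row dicts before it is written to JSON.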


def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
    """Write the summary and percentile reports to disk and log them as tables."""
    result_path = os.path.dirname(result_db_path)
    write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

    metrics_result = metrics.create_message()
    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

    # Print the summary as a table
    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
    logger.info('\nBenchmarking summary:\n' + table)

    # Get percentile results
    percentile_result = get_percentile_results(result_db_path)
    if percentile_result:
        write_json_file(transpose_results(percentile_result), os.path.join(result_path, 'benchmark_percentile.json'))
        # Print percentile results as a table
        table = tabulate(percentile_result, headers='keys', tablefmt='pretty')
        logger.info('\nPercentile results:\n' + table)

    if args.dataset.startswith('speed_benchmark'):
        speed_benchmark_result(result_db_path)

    logger.info(f'Save the summary to: {result_path}')

    return metrics_result, percentile_result
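
# summary_result() leaves benchmark_args.json, benchmark_summary.json and
# benchmark_percentile.json next to the database, plus speed_benchmark.json
# for speed-benchmark datasets (see speed_benchmark_result below).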


def speed_benchmark_result(result_db_path: str):
    """Report average generation speed and GPU memory cost grouped by prompt length."""
    query_sql = """
        SELECT
            prompt_tokens,
            ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
            ROUND(AVG(max_gpu_memory_cost), 2) AS avg_max_gpu_memory_cost
        FROM
            result
        WHERE
            success = 1 AND latency > 0
        GROUP BY
            prompt_tokens
    """

    with sqlite3.connect(result_db_path) as con:
        cursor = con.cursor()
        cursor.execute(query_sql)
        rows = cursor.fetchall()

    # Prepare data for tabulation
    headers = ['Prompt Tokens', 'Speed(tokens/s)', 'GPU Memory(GB)']
    data = [dict(zip(headers, row)) for row in rows]

    # Print results as a table
    table = tabulate(data, headers='keys', tablefmt='pretty')
    logger.info('\nSpeed Benchmark Results:\n' + table)

    # Write results to a JSON file
    result_path = os.path.dirname(result_db_path)
    write_json_file(data, os.path.join(result_path, 'speed_benchmark.json'))