# evalscope_v0.17.0/evalscope.0.17.0/evalscope/perf/utils/rich_display.py
# the following code is largely adapted from https://github.com/lework/llm-benchmark
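# Console reporting helpers for perf benchmarking: analyze_results() aggregates
# per-concurrency metrics into summary rows, and print_summary() renders them as
# Rich tables together with basic tuning recommendations.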
import numpy as np
from rich.console import Console
from rich.panel import Panel
from rich.style import Style
from rich.table import Table
from rich.text import Text
from evalscope.utils.logger import get_logger
from .benchmark_util import Metrics
from .db_util import PercentileMetrics
logger = get_logger()


def analyze_results(all_results):
    """Analyze all test results and generate a summary report.

    ``all_results`` is expected to be a sequence of ``(total_metrics, percentile_metrics)``
    pairs, keyed by the ``Metrics`` and ``PercentileMetrics`` constants respectively.
    Returns ``(summary_rows, total_tokens, total_time)``.
    """
    summary = []
    total_tokens = 0
    total_time = 0

    for result in all_results:
        total_metrics = result[0]
        percentile_metrics = result[1]
        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
        try:
            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
            success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
                            / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]

            # Ensure all values are valid numbers
            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
                continue

            summary.append([
                concurrency,
                f'{rps:.2f}' if rps is not None else 'N/A',
                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
            ])

            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
                Metrics.SUCCEED_REQUESTS, 0)
            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
        except Exception as e:
            # `result` is a (total_metrics, percentile_metrics) pair, so the concurrency
            # must be read from total_metrics rather than from the tuple itself.
            failed_concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 'unknown')
            logger.warning(f'Warning: Error processing results for concurrency {failed_concurrency}: {str(e)}')
            continue
    if not summary:
        logger.warning('Error: No valid test result data')
        return [], 0, 0

    return summary, total_tokens, total_time
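
# Each summary row produced by analyze_results is positional:
#   [0] concurrency, [1] RPS, [2] avg latency (s), [3] P99 latency (s),
#   [4] output tokens/s, [5] avg TTFT (s), [6] success rate string (e.g. '99.5%'),
#   [7] P99 TTFT (s), [8] avg TPOT (s), [9] P99 TPOT (s)
# print_summary below relies on these positions when building the Rich table.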


def print_summary(all_results, model_name):
    """Print test results summary"""
    summary, total_tokens, total_time = analyze_results(all_results)
    if not summary:
        logger.warning('No available test result data to display')
        return

    console = Console(width=100)  # Set fixed width

    # Create title panel
    title = Text('Performance Test Summary Report', style='bold')
    console.print(Panel(title, width=60))

    # Print basic information
    basic_info = Table(show_header=False, width=60)
    basic_info.add_column('Name', style='cyan', width=25)
    basic_info.add_column('Value', style='green', width=35)
    basic_info.add_row('Model', model_name)
    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
    console.print('\nBasic Information:')
    console.print(basic_info)
    # Create detailed performance metrics table
    table = Table(
        title='Detailed Performance Metrics',
        show_header=True,
        header_style='bold cyan',
        border_style='blue',
        width=100,  # Set total table width
        pad_edge=False,  # Reduce edge padding
        min_width=60,  # Minimum width
    )

    # Add columns (set fixed column widths)
    table.add_column('Conc.', justify='right', style='cyan')
    table.add_column('RPS', justify='right')
    table.add_column('Avg Lat.(s)', justify='right')
    table.add_column('P99 Lat.(s)', justify='right')
    table.add_column('Gen. toks/s', justify='right')
    table.add_column('Avg TTFT(s)', justify='right')
    table.add_column('P99 TTFT(s)', justify='right')
    table.add_column('Avg TPOT(s)', justify='right')
    table.add_column('P99 TPOT(s)', justify='right')
    table.add_column('Success Rate', justify='right', style='green')
    # Add data rows
    for row in summary:
        try:
            # Set row style based on success rate
            success_rate = float(row[6].rstrip('%'))
            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
            table.add_row(
                str(row[0]),  # Concurrency
                f'{float(row[1]):.2f}',  # RPS
                f'{float(row[2]):.3f}',  # Average Latency
                f'{float(row[3]):.3f}',  # P99 Latency
                f'{float(row[4]):.2f}',  # Average TPS
                f'{float(row[5]):.3f}',  # First Token Latency
                f'{float(row[7]):.3f}',  # P99 TTFT
                f'{float(row[8]):.3f}',  # Average TPOT
                f'{float(row[9]):.3f}',  # P99 TPOT
                row[6],  # Success Rate
                style=row_style)
        except ValueError as e:
            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
            continue

    console.print('\n')
    console.print(table)
    # Calculate and display best performance configuration
    try:
        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])

        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
        perf_info.add_column('Metric', style='cyan', width=20)
        perf_info.add_column('Value', style='green', width=40)
        perf_info.add_row('Highest RPS',
                          f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
        perf_info.add_row('Lowest Latency',
                          f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
        console.print('\n')
        console.print(perf_info)

        # Performance recommendations
        recommendations = []
        if best_rps_idx == len(summary) - 1:
            recommendations.append(
                'The system seems not to have reached its performance bottleneck, try higher concurrency')
        elif best_rps_idx == 0:
            recommendations.append('Consider lowering concurrency, current load may be too high')
        else:
            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')

        success_rate = float(summary[-1][6][:-1])
        if success_rate < 95:
            recommendations.append(
                'Success rate is low at high concurrency, check system resources or reduce concurrency')

        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
        console.print(recommend_text)
        for rec in recommendations:
            console.print(f'{rec}', style='yellow')
    except Exception as e:
        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
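

if __name__ == '__main__':
    # Illustrative, self-contained demo (not part of the original module). The
    # (total_metrics, percentile_metrics) pair shape and the Metrics /
    # PercentileMetrics keys are taken from how analyze_results reads them above;
    # all metric values below are made up purely to exercise the report rendering.
    demo_percentiles = ['50%', '90%', '99%']
    demo_results = []
    for concurrency, rps in [(1, 2.1), (5, 8.7), (10, 12.3)]:
        total_metrics = {
            Metrics.NUMBER_OF_CONCURRENCY: concurrency,
            Metrics.REQUEST_THROUGHPUT: rps,
            Metrics.AVERAGE_LATENCY: concurrency / rps,
            Metrics.OUTPUT_TOKEN_THROUGHPUT: rps * 120,
            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.15,
            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.02,
            Metrics.SUCCEED_REQUESTS: concurrency * 20,
            Metrics.TOTAL_REQUESTS: concurrency * 20,
            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 120,
            Metrics.TIME_TAKEN_FOR_TESTS: 60.0,
        }
        percentile_metrics = {
            PercentileMetrics.PERCENTILES: demo_percentiles,
            PercentileMetrics.LATENCY: [0.4, 0.9, 1.5],
            PercentileMetrics.TTFT: [0.1, 0.2, 0.35],
            PercentileMetrics.TPOT: [0.015, 0.02, 0.03],
        }
        demo_results.append((total_metrics, percentile_metrics))

    print_summary(demo_results, model_name='demo-model')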