# The following code is largely adapted from https://github.com/lework/llm-benchmark
import numpy as np
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.text import Text

from evalscope.utils.logger import get_logger

from .benchmark_util import Metrics
from .db_util import PercentileMetrics

logger = get_logger()


def analyze_results(all_results):
    """Analyze all test results and generate a summary report.

    Returns a tuple of (summary, total_tokens, total_time), where each summary
    row holds the formatted metrics for one concurrency level.
    """
    summary = []
    total_tokens = 0
    total_time = 0

    for result in all_results:
        total_metrics = result[0]
        percentile_metrics = result[1]
        try:
            percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
            success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
                            / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]

            # Ensure all values are valid numbers
            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
                continue

            summary.append([
                concurrency,
                f'{rps:.2f}' if rps is not None else 'N/A',
                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
            ])

            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
                Metrics.SUCCEED_REQUESTS, 0)
            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
        except Exception as e:
            # `result` is a (total_metrics, percentile_metrics) pair, so read the
            # concurrency from total_metrics rather than from `result` itself.
            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 'unknown')
            logger.warning(f'Warning: Error processing results for concurrency {concurrency}: {str(e)}')
            continue

    if not summary:
        logger.warning('Error: No valid test result data')
        return [], 0, 0

    return summary, total_tokens, total_time


def print_summary(all_results, model_name):
    """Print the test results summary to the console."""
    summary, total_tokens, total_time = analyze_results(all_results)

    if not summary:
        logger.warning('No available test result data to display')
        return

    console = Console(width=100)  # Set fixed width

    # Create title panel
    title = Text('Performance Test Summary Report', style='bold')
    console.print(Panel(title, width=60))

    # Print basic information
    basic_info = Table(show_header=False, width=60)
    basic_info.add_column('Name', style='cyan', width=25)
    basic_info.add_column('Value', style='green', width=35)
    basic_info.add_row('Model', model_name)
    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
    # Guard against division by zero when no test time was recorded
    avg_rate = f'{total_tokens / total_time:.2f} tokens/sec' if total_time > 0 else 'N/A'
    basic_info.add_row('Avg Output Rate', avg_rate)

    console.print('\nBasic Information:')
    console.print(basic_info)

    # Create detailed performance metrics table
    table = Table(
        title='Detailed Performance Metrics',
        show_header=True,
        header_style='bold cyan',
        border_style='blue',
        width=100,  # Set total table width
        pad_edge=False,  # Reduce edge padding
        min_width=60,  # Minimum width
    )

    # Add columns (fixed column widths)
    table.add_column('Conc.', justify='right', style='cyan')
    table.add_column('RPS', justify='right')
    table.add_column('Avg Lat.(s)', justify='right')
    table.add_column('P99 Lat.(s)', justify='right')
    table.add_column('Gen. toks/s', justify='right')
    table.add_column('Avg TTFT(s)', justify='right')
    table.add_column('P99 TTFT(s)', justify='right')
    table.add_column('Avg TPOT(s)', justify='right')
    table.add_column('P99 TPOT(s)', justify='right')
    table.add_column('Success Rate', justify='right', style='green')

    # Add data rows
    for row in summary:
        try:
            # Color each row by its success rate
            success_rate = float(row[6].rstrip('%'))
            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'

            table.add_row(
                str(row[0]),  # Concurrency
                f'{float(row[1]):.2f}',  # RPS
                f'{float(row[2]):.3f}',  # Average latency
                f'{float(row[3]):.3f}',  # P99 latency
                f'{float(row[4]):.2f}',  # Output token throughput
                f'{float(row[5]):.3f}',  # Average TTFT
                f'{float(row[7]):.3f}',  # P99 TTFT
                f'{float(row[8]):.3f}',  # Average TPOT
                f'{float(row[9]):.3f}',  # P99 TPOT
                row[6],  # Success rate
                style=row_style)
        except ValueError as e:
            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
            continue

    console.print('\n')
    console.print(table)

    # Calculate and display the best-performing configurations
    try:
        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])

        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
        perf_info.add_column('Metric', style='cyan', width=20)
        perf_info.add_column('Value', style='green', width=40)
        perf_info.add_row('Highest RPS',
                          f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
        perf_info.add_row('Lowest Latency',
                          f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')

        console.print('\n')
        console.print(perf_info)

        # Performance recommendations
        recommendations = []
        if best_rps_idx == len(summary) - 1:
            recommendations.append('The system does not appear to have reached its bottleneck; try higher concurrency')
        elif best_rps_idx == 0:
            recommendations.append('Consider lowering concurrency; the current load may be too high')
        else:
            recommendations.append(f'The optimal concurrency level is around {summary[best_rps_idx][0]}')

        success_rate = float(summary[-1][6].rstrip('%'))
        if success_rate < 95:
            recommendations.append('Success rate drops at high concurrency; check system resources or reduce concurrency')

        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
        console.print(recommend_text)
        for rec in recommendations:
            console.print(f'• {rec}', style='yellow')

    except Exception as e:
        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
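

# --- Illustrative usage (a minimal sketch, not part of the library) ---
# This demo shows the input shape that print_summary() expects, as inferred
# from analyze_results(): a list of (total_metrics, percentile_metrics) pairs,
# where total_metrics maps the Metrics constants to aggregate values and
# percentile_metrics maps the PercentileMetrics constants to per-percentile
# lists. All sample numbers and the model name are hypothetical, and because
# this module uses relative imports it only runs in package context
# (e.g. via `python -m ...`).
if __name__ == '__main__':
    sample_total = {
        Metrics.NUMBER_OF_CONCURRENCY: 10,
        Metrics.REQUEST_THROUGHPUT: 5.21,  # requests/sec
        Metrics.AVERAGE_LATENCY: 1.92,  # seconds
        Metrics.OUTPUT_TOKEN_THROUGHPUT: 1250.0,  # tokens/sec
        Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.21,  # seconds
        Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.032,  # seconds
        Metrics.SUCCEED_REQUESTS: 100,
        Metrics.TOTAL_REQUESTS: 100,
        Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 240.0,
        Metrics.TIME_TAKEN_FOR_TESTS: 19.2,  # seconds
    }
    sample_percentiles = {
        # analyze_results() looks up the '99%' entry by position, so the
        # metric lists below must align with this percentile list.
        PercentileMetrics.PERCENTILES: ['50%', '90%', '99%'],
        PercentileMetrics.LATENCY: [1.80, 2.40, 3.10],
        PercentileMetrics.TTFT: [0.20, 0.35, 0.52],
        PercentileMetrics.TPOT: [0.030, 0.045, 0.060],
    }
    print_summary([(sample_total, sample_percentiles)], model_name='demo-model')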