#!/bin/bash
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib

rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
    --attention-backend triton \
    --model-path /code/models/Qwen3-32B/ \
    --log-level info \
    --tp 4 --mem-frac 0.25 \
    --host 0.0.0.0 --port 33301 \
    --enable-metrics --enable-cache-report \
    --page-size 64 \
    --enable-hierarchical-cache \
    --hicache-ratio 2.5 --hicache-size 0 \
    --hicache-io-backend kernel \
    --hicache-mem-layout layer_first \
    --hicache-write-policy write_through \
    &

##################################################

export CONFIG_PATH=/tmp/bench_mix_config.json

# num_clients: Maximum number of concurrent client requests to be simulated
# round_ratios: Distribution of requests across rounds. Given sum(round_ratios) total requests,
#               round_ratios[i] denotes the number of requests that will execute for (i+1) rounds
echo '{
    "num_rounds": 10,
    "num_clients": 60,
    "round_ratios": [50, 25, 15, 15, 10, 10, 9, 8, 7, 6],
    "mean_new_tokens_per_round": [1000, 400, 350, 300, 280, 260, 240, 220, 210, 200],
    "mean_return_tokens_per_round": [100, 100, 100, 100, 100, 100, 100, 100, 100, 100],
    "mean_inter_round_interval": [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
}' > ${CONFIG_PATH}

rm -rf bench_mix.out && \
nohup python3 /sgl-workspace/sglang/benchmark/hicache/bench_mix.py \
    --model-path /code/models/Qwen3-32B/ \
    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
    --port 33301 \
    --duration 600 \
    > bench_mix.out &
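
##################################################
# Optional sanity check (a minimal sketch): the server is backgrounded above,
# so poll it until it answers and let a failed launch surface here instead of
# as benchmark timeouts. This assumes sglang's /health endpoint, which returns
# HTTP 200 once the server is serving; adjust the path if your sglang version
# differs. The same loop can instead be placed between the launch and
# benchmark steps to gate the benchmark on server readiness.
until curl -sf http://127.0.0.1:33301/health > /dev/null; do
    echo "waiting for sglang server on port 33301..."
    sleep 5
done
echo "sglang server is up"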
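
# Optional: with --enable-metrics the server exposes Prometheus-format
# counters at /metrics. Grepping broadly for "cache" avoids depending on
# exact metric names, which vary across sglang versions; this gives a quick
# read on hierarchical-cache hit behavior during or after the benchmark run.
curl -s http://127.0.0.1:33301/metrics | grep -i cache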