sglang_v0.5.2/sglang/sgl-router/py_test/e2e/test_regular_router.py

170 lines
4.8 KiB
Python

import threading
import time
from types import SimpleNamespace
import pytest
import requests
from sglang.test.run_eval import run_eval
@pytest.mark.e2e
def test_mmlu(e2e_router_only_rr, e2e_two_workers_dp2, e2e_model):
# Attach two dp=2 workers (total 4 GPUs) to a fresh router-only instance
base = e2e_router_only_rr.url
for w in e2e_two_workers_dp2:
r = requests.post(f"{base}/add_worker", params={"url": w.url}, timeout=180)
r.raise_for_status()
args = SimpleNamespace(
base_url=base,
model=e2e_model,
eval_name="mmlu",
num_examples=64,
num_threads=32,
temperature=0.1,
)
metrics = run_eval(args)
assert metrics["score"] >= 0.65
@pytest.mark.e2e
def test_genai_bench(
e2e_router_only_rr, e2e_two_workers_dp2, e2e_model, genai_bench_runner
):
"""Attach a worker to the regular router and run a short genai-bench."""
base = e2e_router_only_rr.url
for w in e2e_two_workers_dp2:
r = requests.post(f"{base}/add_worker", params={"url": w.url}, timeout=180)
r.raise_for_status()
genai_bench_runner(
router_url=base,
model_path=e2e_model,
experiment_folder="benchmark_round_robin_regular",
thresholds={
"ttft_mean_max": 6,
"e2e_latency_mean_max": 14,
"input_throughput_mean_min": 1000,
"output_throughput_mean_min": 12,
},
kill_procs=e2e_two_workers_dp2,
)
@pytest.mark.e2e
def test_add_and_remove_worker_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
base = e2e_router_only_rr.url
worker_url = e2e_primary_worker.url
r = requests.post(f"{base}/add_worker", params={"url": worker_url}, timeout=180)
r.raise_for_status()
with requests.Session() as s:
for i in range(8):
r = s.post(
f"{base}/v1/completions",
json={
"model": e2e_model,
"prompt": f"x{i}",
"max_tokens": 1,
"stream": False,
},
timeout=120,
)
r.raise_for_status()
# Remove the worker
r = requests.post(f"{base}/remove_worker", params={"url": worker_url}, timeout=60)
r.raise_for_status()
@pytest.mark.e2e
def test_lazy_fault_tolerance_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
base = e2e_router_only_rr.url
worker = e2e_primary_worker
r = requests.post(f"{base}/add_worker", params={"url": worker.url}, timeout=180)
r.raise_for_status()
def killer():
time.sleep(10)
try:
worker.proc.terminate()
except Exception:
pass
t = threading.Thread(target=killer, daemon=True)
t.start()
args = SimpleNamespace(
base_url=base,
model=e2e_model,
eval_name="mmlu",
num_examples=32,
num_threads=16,
temperature=0.0,
)
metrics = run_eval(args)
assert 0.0 <= metrics["score"] <= 1.0
@pytest.mark.e2e
def test_dp_aware_worker_expansion_and_api_key(
e2e_model,
e2e_router_only_rr_dp_aware_api,
e2e_worker_dp2_api,
):
"""
Launch a router-only instance in dp_aware mode and a single worker with dp_size=2
and API key protection. Verify expansion, auth enforcement, and basic eval.
"""
import os
router_url = e2e_router_only_rr_dp_aware_api.url
worker_url = e2e_worker_dp2_api.url
api_key = e2e_router_only_rr_dp_aware_api.api_key
# Attach worker; router should expand to dp_size logical workers
r = requests.post(
f"{router_url}/add_worker", params={"url": worker_url}, timeout=180
)
r.raise_for_status()
r = requests.get(f"{router_url}/list_workers", timeout=30)
r.raise_for_status()
urls = r.json().get("urls", [])
assert len(urls) == 2
assert set(urls) == {f"{worker_url}@0", f"{worker_url}@1"}
# Verify API key enforcement path-through
# 1) Without Authorization -> 401 from backend
r = requests.post(
f"{router_url}/v1/completions",
json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
timeout=60,
)
assert r.status_code == 401
# 2) With correct Authorization -> 200
r = requests.post(
f"{router_url}/v1/completions",
json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
headers={"Authorization": f"Bearer {api_key}"},
timeout=60,
)
assert r.status_code == 200
# Finally, run MMLU eval through the router with auth
os.environ["OPENAI_API_KEY"] = api_key
args = SimpleNamespace(
base_url=router_url,
model=e2e_model,
eval_name="mmlu",
num_examples=64,
num_threads=32,
temperature=0.1,
)
metrics = run_eval(args)
assert metrics["score"] >= 0.65