sglang_v0.5.2/sglang/sgl-router/py_test/e2e/test_pd_router.py

263 lines
7.5 KiB
Python

import logging
import os
import socket
import subprocess
import time
from types import SimpleNamespace
from typing import Optional
import pytest
import requests
from sglang.test.run_eval import run_eval
logger = logging.getLogger(__name__)
def _find_available_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def _wait_health(url: str, timeout: float = 180.0) -> None:
    """Poll ``{url}/health`` once a second until it returns HTTP 200.

    Raises:
        TimeoutError: if the endpoint never reports healthy within *timeout*
            seconds. Connection errors during polling are treated as
            "not up yet" and retried.
    """
    deadline = time.perf_counter() + timeout
    with requests.Session() as session:
        while time.perf_counter() < deadline:
            try:
                if session.get(f"{url}/health", timeout=5).status_code == 200:
                    return
            except requests.RequestException:
                # Server not accepting connections yet; keep polling.
                pass
            time.sleep(1)
    raise TimeoutError(f"Service at {url} failed to become healthy in time")
def _detect_ib_device() -> Optional[str]:
"""Return first active IB device name (e.g., mlx5_0) or None if unavailable."""
# Fast check that ibv_devinfo exists
try:
subprocess.run(
["ibv_devinfo", "-l"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=1,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return None
for i in range(12):
dev = f"mlx5_{i}"
try:
res = subprocess.run(
["ibv_devinfo", dev],
capture_output=True,
text=True,
timeout=2,
)
if res.returncode == 0 and ("state:" in res.stdout):
for line in res.stdout.splitlines():
if "state:" in line and "PORT_ACTIVE" in line:
return dev
except Exception:
pass
return None
def _popen_launch_prefill_worker(
    model: str,
    bootstrap_port: int,
    ib_device: Optional[str] = None,
    base_gpu_id: int = 0,
) -> SimpleNamespace:
    """Launch a prefill-mode sglang server and wait for it to become healthy.

    Args:
        model: model path passed to ``sglang.launch_server``.
        bootstrap_port: disaggregation bootstrap port for this prefill worker.
        ib_device: optional InfiniBand device name for KV transfer.
        base_gpu_id: GPU index the worker should pin to.

    Returns:
        SimpleNamespace with ``proc``, ``url`` and ``bootstrap_port``.

    Raises:
        TimeoutError: if the server does not become healthy; the spawned
            process is terminated before re-raising (no process leak).
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "prefill",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--disaggregation-bootstrap-port",
        str(bootstrap_port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    try:
        _wait_health(url, timeout=300.0)
    except BaseException:
        # Fix: the original leaked the server process when the health check
        # failed. Terminate (escalating to kill) before propagating the error.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        raise
    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)
def _popen_launch_decode_worker(
    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
) -> SimpleNamespace:
    """Launch a decode-mode sglang server and wait for it to become healthy.

    Args:
        model: model path passed to ``sglang.launch_server``.
        ib_device: optional InfiniBand device name for KV transfer.
        base_gpu_id: GPU index the worker should pin to.

    Returns:
        SimpleNamespace with ``proc`` and ``url``.

    Raises:
        TimeoutError: if the server does not become healthy; the spawned
            process is terminated before re-raising (no process leak).
    """
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "decode",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    try:
        _wait_health(url, timeout=300.0)
    except BaseException:
        # Fix: the original leaked the server process when the health check
        # failed. Terminate (escalating to kill) before propagating the error.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        raise
    return SimpleNamespace(proc=proc, url=url)
def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
if proc is None:
return
proc.terminate()
start = time.perf_counter()
while proc.poll() is None:
if time.perf_counter() - start > timeout:
proc.kill()
break
time.sleep(1)
@pytest.fixture(scope="module")
def pd_cluster(e2e_model: str):
    """Start 2 prefill + 2 decode workers and one PD router, once per module.

    Yields a SimpleNamespace with ``router_url``, ``workers`` (list of worker
    namespaces) and ``router_proc``. All processes are terminated in the
    ``finally`` block at module teardown, even if startup fails partway.
    Fails the test session outright (rather than skipping) when sgl_kernel,
    torch, or CUDA is unavailable.
    """
    # Environment capability checks: require sgl_kernel and GPU backend
    try:
        import sgl_kernel  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")
    try:
        import torch  # noqa: F401
    except Exception as e:  # pragma: no cover - environment dependent
        pytest.fail(
            f"PD e2e requires torch but it is not available or misconfigured: {e}"
        )
    if not torch.cuda.is_available():  # pragma: no cover - environment dependent
        pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")
    workers: list[SimpleNamespace] = []
    router_proc = None
    try:
        ib_device = _detect_ib_device()
        # Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3
        pf1 = _popen_launch_prefill_worker(
            e2e_model,
            bootstrap_port=_find_available_port(),
            ib_device=ib_device,
            base_gpu_id=0,
        )
        pf2 = _popen_launch_prefill_worker(
            e2e_model,
            bootstrap_port=_find_available_port(),
            ib_device=ib_device,
            base_gpu_id=1,
        )
        dc1 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=2)
        dc2 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=3)
        prefills = [pf1, pf2]
        decodes = [dc1, dc2]
        workers.extend(prefills + decodes)
        # PD router with two prefill and two decode endpoints
        rport = _find_available_port()
        router_url = f"http://127.0.0.1:{rport}"
        # Separate port for the router's Prometheus metrics endpoint.
        pport = _find_available_port()
        # Each prefill endpoint is paired with its bootstrap port; decode
        # endpoints are plain URLs.
        prefill = [(pf.url, pf.bootstrap_port) for pf in prefills]
        decode = [dc.url for dc in decodes]
        cmd = [
            "python3",
            "-m",
            "sglang_router.launch_router",
            "--host",
            "127.0.0.1",
            "--port",
            str(rport),
            "--policy",
            "round_robin",
            "--pd-disaggregation",
        ]
        # --prefill takes "URL BOOTSTRAP_PORT" as two positional values.
        for url, bport in prefill:
            cmd += ["--prefill", url, str(bport)]
        for url in decode:
            cmd += ["--decode", url]
        cmd += [
            "--prometheus-port",
            str(pport),
            "--prometheus-host",
            "127.0.0.1",
        ]
        router_proc = subprocess.Popen(cmd)
        _wait_health(router_url, timeout=180.0)
        yield SimpleNamespace(
            router_url=router_url, workers=workers, router_proc=router_proc
        )
    finally:
        # Stop the router first so it stops forwarding, then the workers.
        if router_proc is not None:
            _terminate(router_proc)
        for w in workers:
            _terminate(w.proc)
@pytest.mark.e2e
def test_pd_mmlu(e2e_model: str, pd_cluster):
    """
    Launch 4 workers, start a PD router (2 prefill + 2 decode), then run MMLU.
    """
    eval_args = SimpleNamespace(
        eval_name="mmlu",
        base_url=pd_cluster.router_url,
        model=e2e_model,
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    # The cluster fixture guarantees the router is healthy before we get here.
    score = run_eval(eval_args)["score"]
    assert score >= 0.65
@pytest.mark.e2e
def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
    """
    Launch 4 workers, start a PD router (2 prefill + 2 decode), then run a
    short genai-bench benchmark and validate aggregate metrics.
    """
    # Aggregate-metric bounds the benchmark run must satisfy.
    thresholds = {
        "ttft_mean_max": 12,
        "e2e_latency_mean_max": 15,
        "input_throughput_mean_min": 400,
        "output_throughput_mean_min": 20,
    }
    # Run genai-bench against the shared router
    genai_bench_runner(
        router_url=pd_cluster.router_url,
        model_path=e2e_model,
        experiment_folder="benchmark_round_robin_pd",
        thresholds=thresholds,
        kill_procs=pd_cluster.workers,
    )