import logging
import socket
import subprocess
import time
from types import SimpleNamespace
from typing import Optional

import pytest
import requests

from sglang.test.run_eval import run_eval

logger = logging.getLogger(__name__)


def _find_available_port() -> int:
    """Bind to port 0 so the OS picks a free port, then release it.

    Note: this is inherently racy (the port could be reclaimed before the
    server binds it), but it is the standard best-effort approach.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]


def _wait_health(url: str, timeout: float = 180.0) -> None:
    """Poll `{url}/health` until it returns HTTP 200 or `timeout` elapses."""
    start = time.perf_counter()
    with requests.Session() as session:
        while time.perf_counter() - start < timeout:
            try:
                r = session.get(f"{url}/health", timeout=5)
                if r.status_code == 200:
                    return
            except requests.RequestException:
                pass
            time.sleep(1)
    raise TimeoutError(f"Service at {url} failed to become healthy in time")


def _detect_ib_device() -> Optional[str]:
    """Return the first active IB device name (e.g., mlx5_0), or None if unavailable."""
    # Fast check that the ibv_devinfo binary exists at all.
    try:
        subprocess.run(
            ["ibv_devinfo", "-l"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=1,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return None

    # Probe mlx5_0..mlx5_11 and return the first device with an active port.
    for i in range(12):
        dev = f"mlx5_{i}"
        try:
            res = subprocess.run(
                ["ibv_devinfo", dev],
                capture_output=True,
                text=True,
                timeout=2,
            )
            if res.returncode == 0 and "state:" in res.stdout:
                for line in res.stdout.splitlines():
                    if "state:" in line and "PORT_ACTIVE" in line:
                        return dev
        except Exception:
            # Ignore probe failures and move on to the next device index.
            pass
    return None


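# For reference, a hedged sketch of the `ibv_devinfo <dev>` output the helper
# above parses (exact layout varies by driver/firmware version; only the
# "state:" line containing PORT_ACTIVE matters here):
#
#   hca_id: mlx5_0
#           ...
#           port:   1
#                   state:                  PORT_ACTIVE (4)

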
def _popen_launch_prefill_worker(
    model: str,
    bootstrap_port: int,
    ib_device: Optional[str] = None,
    base_gpu_id: int = 0,
) -> SimpleNamespace:
    """Launch a prefill-mode sglang server on a free port and wait until healthy."""
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "prefill",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--disaggregation-bootstrap-port",
        str(bootstrap_port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)


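# For reference, the assembled prefill command looks roughly like this
# (ports are chosen dynamically; the IB flag is only added when detected):
#
#   python3 -m sglang.launch_server --model-path <model> \
#       --disaggregation-mode prefill --host 127.0.0.1 --port <port> \
#       --disaggregation-bootstrap-port <bootstrap_port> \
#       --base-gpu-id <gpu> [--disaggregation-ib-device mlx5_0]

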
def _popen_launch_decode_worker(
    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
) -> SimpleNamespace:
    """Launch a decode-mode sglang server on a free port and wait until healthy."""
    port = _find_available_port()
    url = f"http://127.0.0.1:{port}"
    cmd = [
        "python3",
        "-m",
        "sglang.launch_server",
        "--model-path",
        model,
        "--disaggregation-mode",
        "decode",
        "--host",
        "127.0.0.1",
        "--port",
        str(port),
        "--base-gpu-id",
        str(base_gpu_id),
    ]
    if ib_device:
        cmd += ["--disaggregation-ib-device", ib_device]
    proc = subprocess.Popen(cmd)
    _wait_health(url, timeout=300.0)
    return SimpleNamespace(proc=proc, url=url)


def _terminate(proc: Optional[subprocess.Popen], timeout: float = 120) -> None:
    """Terminate `proc` gracefully, escalating to SIGKILL after `timeout` seconds."""
    if proc is None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()  # reap the killed process so it does not linger as a zombie


@pytest.fixture(scope="module")
|
|
def pd_cluster(e2e_model: str):
|
|
"""Start 2 prefill + 2 decode workers and one PD router, once per module."""
|
|
# Environment capability checks: require sgl_kernel and GPU backend
|
|
try:
|
|
import sgl_kernel # noqa: F401
|
|
except Exception as e: # pragma: no cover - environment dependent
|
|
pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")
|
|
|
|
try:
|
|
import torch # noqa: F401
|
|
except Exception as e: # pragma: no cover - environment dependent
|
|
pytest.fail(
|
|
f"PD e2e requires torch but it is not available or misconfigured: {e}"
|
|
)
|
|
|
|
if not torch.cuda.is_available(): # pragma: no cover - environment dependent
|
|
pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")
|
|
|
|
workers: list[SimpleNamespace] = []
|
|
router_proc = None
|
|
try:
|
|
ib_device = _detect_ib_device()
|
|
|
|
# Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3
|
|
pf1 = _popen_launch_prefill_worker(
|
|
e2e_model,
|
|
bootstrap_port=_find_available_port(),
|
|
ib_device=ib_device,
|
|
base_gpu_id=0,
|
|
)
|
|
pf2 = _popen_launch_prefill_worker(
|
|
e2e_model,
|
|
bootstrap_port=_find_available_port(),
|
|
ib_device=ib_device,
|
|
base_gpu_id=1,
|
|
)
|
|
dc1 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=2)
|
|
dc2 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=3)
|
|
prefills = [pf1, pf2]
|
|
decodes = [dc1, dc2]
|
|
workers.extend(prefills + decodes)
|
|
|
|
# PD router with two prefill and two decode endpoints
|
|
rport = _find_available_port()
|
|
router_url = f"http://127.0.0.1:{rport}"
|
|
pport = _find_available_port()
|
|
|
|
prefill = [(pf.url, pf.bootstrap_port) for pf in prefills]
|
|
decode = [dc.url for dc in decodes]
|
|
|
|
cmd = [
|
|
"python3",
|
|
"-m",
|
|
"sglang_router.launch_router",
|
|
"--host",
|
|
"127.0.0.1",
|
|
"--port",
|
|
str(rport),
|
|
"--policy",
|
|
"round_robin",
|
|
"--pd-disaggregation",
|
|
]
|
|
for url, bport in prefill:
|
|
cmd += ["--prefill", url, str(bport)]
|
|
for url in decode:
|
|
cmd += ["--decode", url]
|
|
cmd += [
|
|
"--prometheus-port",
|
|
str(pport),
|
|
"--prometheus-host",
|
|
"127.0.0.1",
|
|
]
|
|
|
|
router_proc = subprocess.Popen(cmd)
|
|
_wait_health(router_url, timeout=180.0)
|
|
|
|
yield SimpleNamespace(
|
|
router_url=router_url, workers=workers, router_proc=router_proc
|
|
)
|
|
finally:
|
|
if router_proc is not None:
|
|
_terminate(router_proc)
|
|
for w in workers:
|
|
_terminate(w.proc)
|
|
|
|
|
|
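# Resulting topology under the round_robin policy (a hedged sketch; worker
# and router ports are assigned dynamically by _find_available_port):
#
#   client -> router -> prefill pf1 (GPU 0) / pf2 (GPU 1)
#                    -> decode  dc1 (GPU 2) / dc2 (GPU 3)

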
@pytest.mark.e2e
def test_pd_mmlu(e2e_model: str, pd_cluster):
    """Run MMLU through the shared PD router (2 prefill + 2 decode workers)
    started by the pd_cluster fixture."""
    args = SimpleNamespace(
        base_url=pd_cluster.router_url,
        model=e2e_model,
        eval_name="mmlu",
        num_examples=64,
        num_threads=32,
        temperature=0.1,
    )
    metrics = run_eval(args)
    assert metrics["score"] >= 0.65


@pytest.mark.e2e
def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
    """Run a short genai-bench benchmark through the shared PD router
    (2 prefill + 2 decode workers) and validate aggregate metrics."""
    policy_label = "benchmark_round_robin_pd"
    genai_bench_runner(
        router_url=pd_cluster.router_url,
        model_path=e2e_model,
        experiment_folder=policy_label,
        thresholds={
            "ttft_mean_max": 12,
            "e2e_latency_mean_max": 15,
            "input_throughput_mean_min": 400,
            "output_throughput_mean_min": 20,
        },
        kill_procs=pd_cluster.workers,
    )
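
# Threshold semantics come from the genai_bench_runner fixture (defined in
# conftest.py, not shown here); presumably the ttft/e2e latency bounds are in
# seconds and the throughput floors in tokens/s, matching genai-bench output.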