# sglang_v0.5.2/Dockerfile

###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
RUN python3 -m pip install --no-cache-dir numpy requests packaging build
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
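# Usage sketch (not part of the image itself): MAX_JOBS is a build arg, so the
# compile parallelism can be matched to the build host, e.g.
#   docker build --build-arg MAX_JOBS=32 --target builder-torch .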
###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9"
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
curl xz-utils \
&& python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir --no-deps
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
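# A hedged sanity check that could follow this build (torch is already
# installed in this stage, so the version/CUDA pairing is visible):
#   python3 -c "import torch; print(torch.__version__, torch.version.cuda)"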
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone --recursive -b v0.3.1 https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
# Cover your target compute capabilities: 3090=8.6, 4090=8.9, H100=9.0a; add or remove as needed
ENV FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9"
# Run the AOT precompile first, then build the wheel directly (no build isolation, reusing the same self-built torch)
RUN python3 -m pip install --no-cache-dir numpy requests build "cuda-python>=12.0,<13" "nvidia-nvshmem-cu12" ninja pynvml filelock && \
python3 -m flashinfer.aot && \
python3 -m build --no-isolation --wheel && \
ls -lh dist/
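# Optional sanity check (illustrative; uses the stdlib zipfile CLI): list the
# wheel contents to confirm the AOT-compiled kernels were packaged instead of
# being left for JIT compilation at runtime:
#   python3 -m zipfile -l dist/flashinfer_python-*.whl | head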
COPY ./sglang /sgl/sglang
# # ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
# RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.3.9.post2 -d /tmp/sgl_kernel_wheels
ENV PATH=/usr/local/cuda/bin:${PATH}
# ── Build sgl-kernel==0.3.9.post2 from your local source (fully ABI-aligned with the self-built torch) ──
WORKDIR /sgl/sglang/sgl-kernel
# Overwrite ptxas with 12.8 (keeping nvcc 12.6) and print the version to confirm
RUN bash -lc '\
set -euo pipefail; \
NVCC_ARCHIVE_VERSION=12.8.93; \
T=cuda_nvcc-linux-x86_64-${NVCC_ARCHIVE_VERSION}-archive; \
curl -fL --http1.1 -O https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/linux-x86_64/${T}.tar.xz && \
tar -xf ${T}.tar.xz && \
install -m 0755 ${T}/bin/ptxas /usr/local/cuda/bin/ptxas && \
/usr/local/cuda/bin/ptxas --version \
'
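# The result is a mixed toolchain: nvcc stays at 12.6 and only ptxas moves to
# 12.8. An illustrative verification pair:
#   /usr/local/cuda/bin/nvcc --version    # should still report 12.6
#   /usr/local/cuda/bin/ptxas --version   # should now report 12.8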
# Limit build parallelism to avoid ptxas crashes under multithreading
ENV CMAKE_BUILD_PARALLEL_LEVEL=8
ENV SGL_KERNEL_COMPILE_THREADS=1
RUN bash -lc 'ls -la; test -f pyproject.toml -o -f setup.py || (echo "❌ no pyproject.toml/setup.py here; try sgl-kernel/python" && exit 1)'
# Build sgl-kernel (keep FA3; drop the ineffective flag that disabled 90a)
RUN python3 -m pip install --no-cache-dir "cmake>=3.27,<4.0" scikit-build-core==0.11.6 pybind11[global] packaging && \
bash -lc '\
export CMAKE_PREFIX_PATH="$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)")" && \
export TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9" && \
export CUDAARCHS="80;86;89" && \
export CMAKE_CUDA_ARCHITECTURES="$CUDAARCHS" && \
# Keep the regular flags here; if the project supports it, also pin the kernel compile threads (unknown options are ignored without error)
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=$CUDAARCHS -DSGL_KERNEL_COMPILE_THREADS=8 -Wno-dev" && \
python3 -m pip wheel . --no-deps --no-build-isolation -w /tmp/sgl_kernel_wheels \
'
# ★ Build-time constraints: pin the self-built torch / sgl-kernel / flashinfer to the local wheels
RUN bash -lc '\
set -euo pipefail; \
TWHL=$(ls /tmp/torch_dist/torch-*.whl | head -n1); \
SKWHL=$(ls /tmp/sgl_kernel_wheels/sgl_kernel-*.whl | head -n1); \
FWHL=$(ls /opt/flashinfer/dist/flashinfer_python-*.whl 2>/dev/null | head -n1 || true); \
: > /tmp/local_constraints_build.txt; \
echo "torch @ file://$TWHL" >> /tmp/local_constraints_build.txt; \
echo "sgl-kernel @ file://$SKWHL" >> /tmp/local_constraints_build.txt; \
if [ -n "$FWHL" ]; then \
echo "flashinfer-python @ file://$FWHL" >> /tmp/local_constraints_build.txt; \
fi; \
echo ">>> build-time constraints:"; cat /tmp/local_constraints_build.txt \
'
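# For reference, the generated constraints file looks like this (the wheel
# filenames are illustrative, not the exact ones this build produces):
#   torch @ file:///tmp/torch_dist/torch-2.7.1-cp310-cp310-linux_x86_64.whl
#   sgl-kernel @ file:///tmp/sgl_kernel_wheels/sgl_kernel-0.3.9.post2-cp310-cp310-linux_x86_64.whl
#   flashinfer-python @ file:///opt/flashinfer/dist/flashinfer_python-0.3.1-py3-none-any.whl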
RUN python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheels/sgl_kernel-*.whl
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build your local sglang source into a wheel ────────────────────────────
WORKDIR /sgl/sglang/python
RUN python3 -m pip install --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" && \
python3 -m pip wheel --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" -w /tmp/sg_wheels
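# A hedged smoke test that could follow (sglang was just installed into this
# stage by the line above):
#   python3 -c "import sglang; print(sglang.__version__)"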
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.56.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# Produce an offline wheel for openai-harmony
RUN pip wheel --no-deps openai-harmony==0.0.4 -w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
# Package the remaining runtime dependencies as wheels too ──────────────────
RUN pip wheel pybase64==1.3.2 -w /wheels
# Standalone stage that exports the wheels
FROM scratch AS wheelhouse
COPY --from=builder-extras /wheels /
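# Usage sketch (assumes BuildKit/buildx): this scratch stage exists so the
# wheels can be exported to the host, producing the _wheelhouse/ directory
# that the runtime stages below COPY back in:
#   docker buildx build --target wheelhouse --output type=local,dest=_wheelhouse .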
# runtime that installs the wheels from the host directory _wheelhouse/
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-prebuilt
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc g++ build-essential ninja-build cuda-compiler-12-6 \
libcupti-dev cuda-cupti-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 \
libnccl2=2.22.3-1+cuda12.6 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
RUN ldconfig
# ★ Copy the local wheels from the host build context (directory name is fixed: _wheelhouse/)
COPY _wheelhouse/ /tmp/wheels/
# Install order matches runtime-autobuild exactly (torch first, then the rest)
RUN ls -lh /tmp/wheels || true && \
# rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*' -printf "/tmp/wheels/%f ") && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
RUN python3 -m pip install --no-deps xgrammar==0.1.24
RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# To be safe, also set the environment variable (some base images do not add extras to ld.so.conf)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
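# An illustrative runtime check for the CUPTI setup above (assumes the dev
# symlink from libcupti-dev; a missing library would raise OSError):
#   python3 -c "import ctypes; ctypes.CDLL('libcupti.so'); print('CUPTI OK')"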
###############################################################################
# Stage 2 ─ runtime: minimal runtime image (offline wheel install only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-autobuild
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libcupti-dev cuda-cupti-12-6 \
libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 libnccl2=2.22.3-1+cuda12.6 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip
# Check the CUPTI shared library
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
# 👇 Recommended follow-up
RUN ldconfig
COPY _wheelhouse/ /tmp/wheels/
# ✅ Install your self-built torch first (so it is not shadowed by the PyPI build)
RUN ls -lh /tmp/wheels && \
# rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
# python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
RUN python3 -m pip install --no-deps xgrammar==0.1.24
RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# To be safe, also set the environment variable (some base images do not add extras to ld.so.conf)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Ensure the directory exists
RUN mkdir -p /tmp/prometheus
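# Sketch of the consumer side (uses prometheus_client's documented multiprocess
# mode): a scrape endpoint aggregates the per-worker metric files written into
# the directory above roughly like this:
#   python3 - <<'EOF'
#   from prometheus_client import CollectorRegistry, generate_latest, multiprocess
#   registry = CollectorRegistry()
#   multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
#   print(generate_latest(registry).decode())
#   EOF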
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
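# (Assumption about the layout: these are JSON files whose names encode the MoE
# shape and GPU, e.g. "E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", so
# sglang can pick a matching pre-tuned config at runtime.)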
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
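# Combined with the CMD at the end of this file, the effective PID 1 command is
#   /tini -- /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
# so tini handles signal forwarding and zombie reaping for supervisord's children.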
# ---- Copy the model (path is interchangeable) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]