# sglang_v0.5.2/Dockerfile

###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
RUN python3 -m pip install --no-cache-dir numpy requests packaging build
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
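# Usage sketch (not part of the image itself): MAX_JOBS is a build arg, so the
# compile parallelism can be matched to the build host, e.g.
#   docker build --build-arg MAX_JOBS=32 --target builder-torch .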
###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9"
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
curl xz-utils \
&& python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir --no-deps
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
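# A hedged sanity check that could follow this build (torch is already
# installed in this stage, so the version/CUDA pairing is visible):
#   python3 -c "import torch; print(torch.__version__, torch.version.cuda)"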
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone --recursive -b v0.3.1 https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
# Cover your target compute capabilities: 3090=8.6, 4090=8.9, H100=9.0a; add or remove as needed
ENV FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9"
# Run the AOT precompile first, then build the wheel directly (no build isolation, reusing the same self-built torch)
RUN python3 -m pip install --no-cache-dir numpy requests build "cuda-python>=12.0,<13" "nvidia-nvshmem-cu12" ninja pynvml filelock && \
python3 -m flashinfer.aot && \
python3 -m build --no-isolation --wheel && \
ls -lh dist/
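# Optional sanity check (illustrative; uses the stdlib zipfile CLI): list the
# wheel contents to confirm the AOT-compiled kernels were packaged instead of
# being left for JIT compilation at runtime:
#   python3 -m zipfile -l dist/flashinfer_python-*.whl | head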
COPY ./sglang /sgl/sglang
# # ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
# RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.3.9.post2 -d /tmp/sgl_kernel_wheels
ENV PATH=/usr/local/cuda/bin:${PATH}
# ── Build sgl-kernel==0.3.9.post2 from your local source (fully ABI-aligned with the self-built torch) ──
WORKDIR /sgl/sglang/sgl-kernel
# Overwrite ptxas with 12.8 (keeping nvcc 12.6) and print the version to confirm
RUN bash -lc '\
set -euo pipefail; \
NVCC_ARCHIVE_VERSION=12.8.93; \
T=cuda_nvcc-linux-x86_64-${NVCC_ARCHIVE_VERSION}-archive; \
curl -fL --http1.1 -O https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/linux-x86_64/${T}.tar.xz && \
tar -xf ${T}.tar.xz && \
install -m 0755 ${T}/bin/ptxas /usr/local/cuda/bin/ptxas && \
/usr/local/cuda/bin/ptxas --version \
'
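# The result is a mixed toolchain: nvcc stays at 12.6 and only ptxas moves to
# 12.8. An illustrative verification pair:
#   /usr/local/cuda/bin/nvcc --version    # should still report 12.6
#   /usr/local/cuda/bin/ptxas --version   # should now report 12.8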
# Limit build parallelism to avoid ptxas crashes under multithreading
ENV CMAKE_BUILD_PARALLEL_LEVEL=8
ENV SGL_KERNEL_COMPILE_THREADS=1
RUN bash -lc 'ls -la; test -f pyproject.toml -o -f setup.py || (echo "❌ no pyproject.toml/setup.py here; try sgl-kernel/python" && exit 1)'
# Build sgl-kernel (keep FA3; drop the ineffective flag that disabled 90a)
RUN python3 -m pip install --no-cache-dir "cmake>=3.27,<4.0" scikit-build-core==0.11.6 pybind11[global] packaging && \
bash -lc '\
export CMAKE_PREFIX_PATH="$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)")" && \
export TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9" && \
export CUDAARCHS="80;86;89" && \
export CMAKE_CUDA_ARCHITECTURES="$CUDAARCHS" && \
# Keep the regular flags here; if the project supports it, also pin the kernel compile threads (unknown options are ignored without error)
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=$CUDAARCHS -DSGL_KERNEL_COMPILE_THREADS=8 -Wno-dev" && \
python3 -m pip wheel . --no-deps --no-build-isolation -w /tmp/sgl_kernel_wheels \
'
# ★ Build-time constraints: pin the self-built torch / sgl-kernel / flashinfer to the local wheels
RUN bash -lc '\
set -euo pipefail; \
TWHL=$(ls /tmp/torch_dist/torch-*.whl | head -n1); \
SKWHL=$(ls /tmp/sgl_kernel_wheels/sgl_kernel-*.whl | head -n1); \
FWHL=$(ls /opt/flashinfer/dist/flashinfer_python-*.whl 2>/dev/null | head -n1 || true); \
: > /tmp/local_constraints_build.txt; \
echo "torch @ file://$TWHL" >> /tmp/local_constraints_build.txt; \
echo "sgl-kernel @ file://$SKWHL" >> /tmp/local_constraints_build.txt; \
if [ -n "$FWHL" ]; then \
echo "flashinfer-python @ file://$FWHL" >> /tmp/local_constraints_build.txt; \
fi; \
echo ">>> build-time constraints:"; cat /tmp/local_constraints_build.txt \
'
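# For reference, the generated constraints file looks like this (the wheel
# filenames are illustrative, not the exact ones this build produces):
#   torch @ file:///tmp/torch_dist/torch-2.7.1-cp310-cp310-linux_x86_64.whl
#   sgl-kernel @ file:///tmp/sgl_kernel_wheels/sgl_kernel-0.3.9.post2-cp310-cp310-linux_x86_64.whl
#   flashinfer-python @ file:///opt/flashinfer/dist/flashinfer_python-0.3.1-py3-none-any.whl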
RUN python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheels/sgl_kernel-*.whl
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build your local sglang source into a wheel ────────────────────────────
WORKDIR /sgl/sglang/python
RUN python3 -m pip install --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" && \
python3 -m pip wheel --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" -w /tmp/sg_wheels
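# A hedged smoke test that could follow (sglang was just installed into this
# stage by the line above):
#   python3 -c "import sglang; print(sglang.__version__)"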
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.56.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# Produce an offline wheel for openai-harmony
RUN pip wheel --no-deps openai-harmony==0.0.4 -w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
# Package the remaining runtime dependencies as wheels too ──────────────────
RUN pip wheel pybase64==1.3.2 -w /wheels
# Standalone stage that exports the wheels
FROM scratch AS wheelhouse
COPY --from=builder-extras /wheels /
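# Usage sketch (assumes BuildKit/buildx): this scratch stage exists so the
# wheels can be exported to the host, producing the _wheelhouse/ directory
# that the runtime stages below COPY back in:
#   docker buildx build --target wheelhouse --output type=local,dest=_wheelhouse .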
# runtime that installs the wheels from the host directory _wheelhouse/
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-prebuilt
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc g++ build-essential ninja-build cuda-compiler-12-6 \
libcupti-dev cuda-cupti-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 \
libnccl2=2.22.3-1+cuda12.6 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
RUN ldconfig
# ★ Copy the local wheels from the host build context (directory name is fixed: _wheelhouse/)
COPY _wheelhouse/ /tmp/wheels/
# Install order matches runtime-autobuild exactly (torch first, then the rest)
RUN ls -lh /tmp/wheels || true && \
# rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*' -printf "/tmp/wheels/%f ") && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
RUN python3 -m pip install --no-deps xgrammar==0.1.24
RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# To be safe, also set the environment variable (some base images do not add extras to ld.so.conf)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
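# An illustrative runtime check for the CUPTI setup above (assumes the dev
# symlink from libcupti-dev; a missing library would raise OSError):
#   python3 -c "import ctypes; ctypes.CDLL('libcupti.so'); print('CUPTI OK')"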
###############################################################################
# Stage 2 ─ runtime: minimal runtime image (offline wheel install only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-autobuild
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libcupti-dev cuda-cupti-12-6 \
libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 libnccl2=2.22.3-1+cuda12.6 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip
# Check the CUPTI shared library
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
# 👇 Recommended follow-up
RUN ldconfig
COPY _wheelhouse/ /tmp/wheels/
# ✅ Install your self-built torch first (so it is not shadowed by the PyPI build)
RUN ls -lh /tmp/wheels && \
# rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
# python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
RUN python3 -m pip install --no-deps xgrammar==0.1.24
RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# To be safe, also set the environment variable (some base images do not add extras to ld.so.conf)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Ensure the directory exists
RUN mkdir -p /tmp/prometheus
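# Sketch of the consumer side (uses prometheus_client's documented multiprocess
# mode): a scrape endpoint aggregates the per-worker metric files written into
# the directory above roughly like this:
#   python3 - <<'EOF'
#   from prometheus_client import CollectorRegistry, generate_latest, multiprocess
#   registry = CollectorRegistry()
#   multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
#   print(generate_latest(registry).decode())
#   EOF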
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
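# (Assumption about the layout: these are JSON files whose names encode the MoE
# shape and GPU, e.g. "E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", so
# sglang can pick a matching pre-tuned config at runtime.)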
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
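# Combined with the CMD at the end of this file, the effective PID 1 command is
#   /tini -- /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
# so tini handles signal forwarding and zombie reaping for supervisord's children.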
# ---- Copy the model (path is interchangeable) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]