###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.8.0 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

RUN python3 -m pip install --no-cache-dir numpy requests packaging build

# PyTorch requires cmake >= 3.27; Ubuntu 22.04 ships cmake 3.22.1, so install a newer one:
RUN python3 -m pip install --no-cache-dir "cmake>=3.29,<4.0" "ninja>=1.11" && \
    cmake --version && ninja --version

WORKDIR /opt
# RUN git clone --recursive -b v2.8.0 https://github.com/pytorch/pytorch.git
COPY ./pytorch_2.8.0/ /opt/pytorch
WORKDIR /opt/pytorch

ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    export PYTORCH_BUILD_VERSION=2.8.0 PYTORCH_BUILD_NUMBER=1 && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
#           self-built torch, and collect all wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9"
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    curl xz-utils \
    && python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# Newer cmake/ninja, needed for torchvision and sglang 0.5.2:
RUN python3 -m pip install --no-cache-dir "cmake>=3.29,<4.0" "ninja>=1.11" && \
    cmake --version && ninja --version

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir --no-deps && \
    # Immediately install torch's runtime dependencies (important!)
    python3 -m pip install --no-cache-dir \
        "typing-extensions>=4.10.0" "sympy>=1.13.3" jinja2 fsspec networkx filelock
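# Optional extra sanity check beyond the import test below (a sketch, left
# commented out: torch.cuda.is_available() is always False inside `docker build`
# since no GPU is mounted, so only build-time metadata can be inspected here):
# RUN python3 -c "import torch; print('CUDA build:', torch.version.cuda)"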
RUN python3 -c "import torch, typing_extensions, sympy, jinja2, fsspec, networkx; print('✅ Torch:', torch.__version__)"

# ── Build torchvision 0.23.0 (against the local torch) ──────────────────────
WORKDIR /opt
# RUN git clone -b v0.23.0 https://github.com/pytorch/vision.git
COPY ./vision_0.23.0/ /opt/vision
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel && \
    pip install --no-cache-dir --no-deps dist/torchvision-*.whl

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
WORKDIR /opt
# RUN git clone --recursive -b v0.3.1 https://github.com/flashinfer-ai/flashinfer.git
COPY ./flashinfer_0.3.1/ /opt/flashinfer
WORKDIR /opt/flashinfer

# Cover your target compute capabilities: 3090 = 8.6, 4090 = 8.9, H100 = 9.0a;
# add or remove entries as needed.
ENV FLASHINFER_CUDA_ARCH_LIST="8.0 8.6 8.9"

# AOT-precompile first, then build the wheel directly (no build isolation, so
# the same self-built torch is reused):
RUN python3 -m pip install --no-cache-dir numpy requests build "cuda-python>=12.0,<13" "nvidia-nvshmem-cu12" ninja pynvml && \
    bash -lc 'unset TORCH_CUDA_ARCH_LIST; \
        FLASHINFER_CUDA_ARCH_LIST="8.0 8.6 8.9" python3 -m flashinfer.aot' && \
    python3 -m build --no-isolation --wheel && \
    ls -lh dist/ && \
    python3 -m pip install --no-cache-dir --no-deps dist/*.whl

COPY ./sglang /sgl/sglang

# # ── 🔄 Download sgl-kernel (kept in sync with sglang) ───────────────────────
# RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.3.9.post2 -d /tmp/sgl_kernel_wheels

ENV PATH=/usr/local/cuda/bin:${PATH}

# ── Build sgl-kernel==0.3.9.post2 from your local source (fully ABI-aligned
#    with the self-built torch) ───────────────────────────────────────────────
WORKDIR /sgl/sglang/sgl-kernel

# Overlay ptxas 12.8 (keeping nvcc 12.6) and print its version to confirm:
RUN bash -lc '\
    set -euo pipefail; \
    NVCC_ARCHIVE_VERSION=12.8.93; \
    T=cuda_nvcc-linux-x86_64-${NVCC_ARCHIVE_VERSION}-archive; \
    curl -fL --http1.1 -O https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/linux-x86_64/${T}.tar.xz && \
    tar -xf ${T}.tar.xz && \
    install -m 0755 ${T}/bin/ptxas /usr/local/cuda/bin/ptxas && \
    /usr/local/cuda/bin/ptxas --version \
'

# Limit build parallelism to avoid ptxas crashing under heavy multithreading:
ENV CMAKE_BUILD_PARALLEL_LEVEL=8
ENV SGL_KERNEL_COMPILE_THREADS=1

RUN bash -lc 'ls -la; test -f pyproject.toml -o -f setup.py || (echo "❌ no pyproject.toml/setup.py here; try sgl-kernel/python" && exit 1)'

# Build sgl-kernel (keep FA3; the ineffective flag disabling 90a was dropped).
# The usual parameters are kept; if the project supports it, the kernel compile
# thread count is also passed through CMAKE_ARGS (unknown CMake options are
# ignored, so this will not error):
RUN python3 -m pip install --no-cache-dir "cmake>=3.27,<4.0" scikit-build-core==0.11.6 pybind11[global] packaging && \
    bash -lc '\
        export CMAKE_PREFIX_PATH="$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)")" && \
        export TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9" && \
        export CUDAARCHS="80;86;89" && \
        export CMAKE_CUDA_ARCHITECTURES="$CUDAARCHS" && \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=$CUDAARCHS -DSGL_KERNEL_COMPILE_THREADS=8 -Wno-dev" && \
        python3 -m pip wheel . --no-deps --no-build-isolation -w /tmp/sgl_kernel_wheels \
'
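# Optional: confirm what was produced before pinning it in the constraints
# file below (a sketch, left commented out; the exact filename depends on the
# Python tag, e.g. sgl_kernel-0.3.9.post2-cp310-cp310-linux_x86_64.whl is
# illustrative only):
# RUN ls -lh /tmp/sgl_kernel_wheels/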
# ★ Build-time constraints: pin the self-built torch, sgl-kernel, and
#   flashinfer to the local wheels, for the sglang 0.5.2 wheel build below.
RUN bash -lc '\
    set -euo pipefail; \
    TWHL=$(ls /tmp/torch_dist/torch-*.whl | head -n1); \
    SKWHL=$(ls /tmp/sgl_kernel_wheels/sgl_kernel-*.whl | head -n1); \
    FWHL=$(ls /opt/flashinfer/dist/flashinfer_python-*.whl 2>/dev/null | head -n1 || true); \
    : > /tmp/local_constraints_build.txt; \
    echo "torch @ file://$TWHL" >> /tmp/local_constraints_build.txt; \
    echo "sgl-kernel @ file://$SKWHL" >> /tmp/local_constraints_build.txt; \
    if [ -n "$FWHL" ]; then \
        echo "flashinfer-python @ file://$FWHL" >> /tmp/local_constraints_build.txt; \
    fi; \
    echo ">>> build-time constraints:"; cat /tmp/local_constraints_build.txt \
'

# Install the freshly built sgl_kernel into the image being built:
RUN python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheels/sgl_kernel-*.whl

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build your local sglang source and produce a wheel ──────────────────────
WORKDIR /sgl/sglang/python
RUN python3 -m pip install --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" && \
    python3 -m pip wheel --no-build-isolation -c /tmp/local_constraints_build.txt ".[srt,openai]" -w /tmp/sg_wheels

# ── Collect all wheels into /wheels ─────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies required by the runtime stage ──────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.56.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 \
    cloudpickle compressed-tensors einops openai py-cpuinfo dill \
    partial_json_parser python-multipart torchao \
    -w /wheels

# Produce an offline wheel for openai-harmony:
RUN pip wheel --no-deps openai-harmony==0.0.4 -w /wheels

# ── ✅ Package the dependencies needed by the gradio UI ─────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

# Package the remaining runtime dependency as a wheel as well:
RUN pip wheel pybase64==1.3.2 -w /wheels

# Standalone stage that exports just the wheels:
FROM scratch AS wheelhouse
COPY --from=builder-extras /wheels /

# Runtime that installs the wheels from the host directory _wheelhouse/
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-prebuilt
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    libcupti-dev cuda-cupti-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 \
    libnccl2=2.22.3-1+cuda12.6 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip

# Check for the CUPTI shared library, then refresh the linker cache:
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
RUN ldconfig

# ★ Copy the local wheels from the host build context (fixed name: _wheelhouse/)
COPY _wheelhouse/ /tmp/wheels/
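# _wheelhouse/ is expected to be populated on the host beforehand, e.g. by
# exporting the wheelhouse stage above (illustrative invocation; requires
# BuildKit):
#   DOCKER_BUILDKIT=1 docker build --target wheelhouse --output type=local,dest=_wheelhouse .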
# Install order matches runtime-autobuild exactly (torch first, then the rest):
RUN ls -lh /tmp/wheels || true && \
    rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl || true && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl || true && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl || true && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl || true && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*' -printf "/tmp/wheels/%f ") && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

RUN python3 -m pip install --no-deps xgrammar==0.1.24

RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# Belt and braces, also set the env var (some base images do not add the
# CUPTI extras directory to ld.so.conf):
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-autobuild
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libcupti-dev cuda-cupti-12-6 \
    libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 \
    libnccl2=2.22.3-1+cuda12.6 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip

# Check for the CUPTI shared library, then refresh the linker cache:
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
RUN ldconfig

COPY _wheelhouse/ /tmp/wheels/
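# This variant is selected with an illustrative invocation such as:
#   docker build --target runtime-autobuild -t sglang-runtime .
# Note that it installs from the same host _wheelhouse/ as runtime-prebuilt.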
# ✅ Install your self-built torch first so the PyPI version cannot shadow it:
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client:
RUN python3 -m pip install --no-cache-dir prometheus_client
RUN python3 -m pip install --no-deps xgrammar==0.1.24

RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# Belt and braces, also set the env var (some base images do not add the
# CUPTI extras directory to ld.so.conf):
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}

# ✅ Multiprocess metrics directory (used by MultiProcessCollector):
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Ensure the directory exists:
RUN mkdir -p /tmp/prometheus

# ---- Copy the pre-tuned MoE Triton kernel configs --------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

# ✅ Add Tini as PID 1 (recommended):
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is adjustable) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
    CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor:
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config file and the UI script:
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process:
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
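# The supervisord.conf copied above is expected to start the sglang server on
# :30000 (the HEALTHCHECK probes /health there) and the UI on :30001. A minimal
# sketch, assuming standard sglang launch flags and a hypothetical model path;
# your actual config will differ:
#
#   [supervisord]
#   nodaemon=true
#
#   [program:sglang]
#   command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B
#
#   [program:ui]
#   command=python3 /app/meta_ui.py
#
# Illustrative run command (adjust GPU selection, ports, and mounts):
#   docker run --gpus all --ipc=host -p 30000:30000 -p 30001:30001 <image:tag>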