###############################################################################
# Stage 0 ─ builder-torch: compile PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# PyTorch build switches: CUDA + distributed backends (MPI/Gloo/NCCL), link
# against the system NCCL pinned below, and skip the test suite to save time.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Parallel compile jobs; override with --build-arg MAX_JOBS=N on smaller hosts.
ARG MAX_JOBS=90

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Toolchain + NCCL pinned to the CUDA 12.6 build of the base image.
# apt lists are removed in the same layer so they never land in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
        libopenblas-dev libopenmpi-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libjpeg-dev libpng-dev ca-certificates && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
# Shallow clone of the release tag (with submodules) — avoids downloading the
# full pytorch history, which is several GB.
RUN git clone --recursive --depth 1 -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the
# self-built torch and collect every wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel
# ────────────────────────────────────────────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (links against the local torch) ────────────────
WORKDIR /opt
RUN git clone --depth 1 -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
# Build the wheel ONCE and install that wheel. The previous `pip install .`
# followed by `pip wheel .` compiled the project twice for no benefit.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN python3 -m pip wheel . --no-deps -w dist/ && \
    python3 -m pip install --no-cache-dir dist/flashinfer_python-*.whl

# # ── Install vllm (skip compilation, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download a prebuilt vllm wheel to avoid compiling flash-attn ────────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── sgl-kernel prebuilt Python module ───────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect every wheel into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also wheel the dependencies required by the runtime stage ─────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal image, wheels installed offline
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip

# 👉 Copy the cupti shared libraries (avoids hard-coding the patch version)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic-linker cache so the freshly copied libcupti is resolvable.
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

# ✅ Install the self-built torch FIRST, with --no-deps, so nothing from PyPI
#    can shadow it. (Previously `pip install xgrammar` ran before this step and
#    pulled a PyPI torch as a dependency; pip then skipped the local torch
#    wheel as "already satisfied", silently discarding the custom build.)
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ xgrammar + Prometheus client: installed AFTER the offline wheels so their
#    dependency resolution sees the already-installed self-built torch.
RUN python3 -m pip install --no-cache-dir xgrammar prometheus_client

# ✅ Multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
RUN mkdir -p /tmp/prometheus

# ✅ Tini as PID 1 (signal forwarding + zombie reaping)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model weights (path can be swapped) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
    CMD curl -fs http://localhost:30000/health || exit 1

# ---- Exposed port ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
    "--host", "0.0.0.0", \
    "--port", "30000", \
    "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
    "--tp", "2", \
    "--api-key", "token-abc123", \
    "--enable-metrics"]