Compare commits

..

No commits in common. "main" and "v1.0.0" have entirely different histories.
main ... v1.0.0

2 changed files with 54 additions and 3 deletions

View File

@ -41,8 +41,6 @@ RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
export PYTORCH_BUILD_VERSION=2.8.0 PYTORCH_BUILD_NUMBER=1 && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: use the self-built Torch to install TV / flashinfer / sglang, and collect the wheels
###############################################################################
@ -188,6 +186,12 @@ RUN mkdir -p /wheels && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies required by the runtime stage ────────────
# RUN pip wheel \
# pydantic orjson psutil pyzmq pynvml \
# transformers==4.56.0 uvicorn fastapi IPython aiohttp \
# setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
# -w /wheels
# ── ✅ Also package runtime-stage dependencies (local wheels first; fall back to network if missing) ──
RUN bash -lc '\
@ -207,12 +211,59 @@ RUN pip wheel "gradio==5.38.2" requests -w /wheels
# Pre-build a wheel for pybase64 (pinned) into the shared wheelhouse.
RUN pip wheel pybase64==1.3.2 -w /wheels
# Standalone export stage: holds nothing but the collected wheels.
FROM scratch AS wheelhouse
COPY --from=builder-extras /wheels /
# runtime variant that installs wheels from the host-context directory _wheelhouse/
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime-prebuilt
# NOTE(review): DEBIAN_FRONTEND=noninteractive is baked into the runtime ENV here;
# conventionally it should only be set inline for the apt-get RUN below so it does
# not leak into running containers.
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# OS toolchain + CUDA compiler/CUPTI + Python + native libs needed by the wheels.
# NOTE(review): only libnccl2 is version-pinned; the other apt packages float.
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc g++ build-essential ninja-build cuda-compiler-12-6 \
libcupti-dev cuda-cupti-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 \
libnccl2=2.22.3-1+cuda12.6 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip
# Sanity check: fail the build early if the loader cannot see any CUPTI library.
RUN ldconfig -p | grep -i cupti || (echo "no cupti"; exit 1)
RUN ldconfig
# ★ Copy the local wheel directory from the host build context
#   (the directory name is fixed: _wheelhouse/).
COPY _wheelhouse/ /tmp/wheels/
# Install order matches runtime-autobuild exactly (torch first, then the rest).
# NOTE(review): the `|| true` on the vllm / sgl_kernel / gradio installs makes a
# missing or broken wheel non-fatal — deliberate best-effort, but failures are
# silent; the gradio import check below is the only guard.
RUN ls -lh /tmp/wheels || true && \
rm -f /tmp/wheels/huggingface_hub-0.34.4*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl || true && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*' -printf "/tmp/wheels/%f ") && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# xgrammar comes from the network (pinned), not from the wheelhouse.
RUN python3 -m pip install --no-deps xgrammar==0.1.24
# Make the CUPTI extras directory known to the dynamic loader.
RUN echo "/usr/local/cuda/extras/CUPTI/lib64" > /etc/ld.so.conf.d/cupti.conf && ldconfig
# Belt and braces: also export it via env var (some base images do not add the
# extras path to ld.so.conf).
# NOTE(review): if LD_LIBRARY_PATH is unset in the base image this leaves a
# trailing colon (empty path entry); also, no USER is set anywhere in this
# stage, so the container runs as root — confirm that is intended.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH}
###############################################################################
# Stage 2 ─ runtime: minimal runtime image, wheels installed offline only
###############################################################################

View File

@ -5,7 +5,7 @@ logfile_maxbytes=0
loglevel=info
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --lora-target-modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj --max-lora-rank 32 --served-model-name qwen3-32b --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics --log-requests --log-requests-level 2
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --lora-target-modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj --max-lora-rank 16 --served-model-name qwen3-32b --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics --log-requests --log-requests-level 2
autostart=true
autorestart=true
stdout_logfile=/dev/stdout