.

2025-09-14 19:12:27 +08:00 · 2025-09-14 18:46:44 +08:00 · 2025-09-14 18:39:52 +08:00 · 2025-09-14 18:07:18 +08:00 · 2025-09-03 10:33:08 +08:00 · 2025-09-03 10:01:21 +08:00
20 changed files with 2406 additions and 93 deletions
--- a/66
+++ b/66
@ -89,8 +89,9 @@ WORKDIR /sgl/sglang/python
 RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

-# ── sgl-kernel 的 Python 模块 ───────────────────────────────
-RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel  
+
+# ── 🔄 Download sgl-kernel (version kept in sync with sglang) ────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

 # ── 收集所有 wheel 到 /wheels ──────────────────────────────────────────────
 RUN mkdir -p /wheels && \
@ -99,6 +100,7 @@ RUN mkdir -p /wheels && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl   /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

 # ── ✅ 再打包 runtime 阶段必需依赖 ────────────────────────────────────────────
@ -108,6 +110,9 @@ RUN pip wheel \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

+# ── ✅ Build wheels for the dependencies of the gradio UI ─────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
 ###############################################################################
 # Stage 2 ─ runtime：极简运行镜像，仅离线安装 wheel
 ###############################################################################
@ -117,7 +122,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

 RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6\
-        python3 python3-dev python3-pip python3-distutils ca-certificates \
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
@ -130,27 +135,35 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 # 👇建议在后面补上
 RUN ldconfig

-COPY --from=builder-extras /wheels /tmp/wheels
-COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+# NOTE(review): this COPY runs before the wheels (including sglang) are installed
+# below; confirm pip does not overwrite same-named files in this directory.
+COPY --from=builder-extras /wheels /tmp/wheels

-#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
 # ✅ 优先装你自编的 torch，避免被 PyPI 上的覆盖
 RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    # Install every remaining wheel except gradio (installed just above) in one pass.
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

-# # 安装运行时漏掉的依赖
-# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton

-# ✅ 离线安装全部依赖（包含所有运行时必需包）
-# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-#     python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
-#     rm -rf /tmp/wheels
+
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Directory for multi-process metrics collection (used by MultiProcessCollector)
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ Make sure the directory exists
+RUN mkdir -p /tmp/prometheus

 # ✅ 添加 Tini（推荐）
 ENV TINI_VERSION=v0.19.0
@ -159,15 +172,20 @@ RUN chmod +x /tini
 ENTRYPOINT ["/tini", "--"]

 # ---- 拷贝模型（路径可换） ----
-COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
+# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
+# Probe /health on port 30000 every 30s after a 300s start period (2s timeout, 5 retries).
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

 # ---- 暴露端口 ----
-EXPOSE 30000
+EXPOSE 30000 30001

-# ---- 启动 SGLang 推理服务 ----
-CMD ["python3", "-m", "sglang.launch_server", \
-     "--host", "0.0.0.0", \
-     "--port", "30000", \
-     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
-     "--tp", "1", \
-     "--api-key", "token-abc123"]
+# Install supervisor without recommended extras; drop the apt lists in the same layer
+RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
+    mkdir -p /etc/supervisor/conf.d && rm -rf /var/lib/apt/lists/*
+
+# Copy the supervisord configuration file and the UI script
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Run supervisord as the container's main process
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/Dockerfile.Qwen3-30B-A3B
+++ b/Dockerfile.Qwen3-30B-A3B
@ -0,0 +1,191 @@
+###############################################################################
+# Stage 0 - builder-torch: build a PyTorch 2.7.1 wheel from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Number of parallel build jobs; exported to the environment further down.
+ARG MAX_JOBS=90
+
+# apt / Python / locale settings plus the CUDA architectures to compile for.
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches handed to the PyTorch build through the environment.
+ENV BUILD_TEST=0 USE_CUDA=1 USE_DISTRIBUTED=1 \
+    USE_GLOO=1 USE_MPI=1 USE_NCCL=1 USE_SYSTEM_NCCL=1
+
+# Toolchain and headers; NCCL is pinned to the CUDA 12.6 packages.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates cmake git ninja-build \
+      libjpeg-dev libnccl-dev=2.22.3-1+cuda12.6 libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev libopenmpi-dev libpng-dev \
+      python3 python3-dev python3-distutils python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      numpy pip pyyaml setuptools sympy typing-extensions wheel
+
+WORKDIR /opt
+RUN git clone --recursive --branch v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install the self-built torch, build torchvision /
+# flashinfer / sglang against it, and collect all wheels under /wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ───────────────────────────────────────
+# The glob is handed to pip directly: if stage 0 produced no wheel, the pattern
+# stays unexpanded and pip fails right here, instead of `xargs -r` silently
+# skipping the install and the error surfacing later in the torchvision build.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
+# NOTE(review): the clone is not pinned to a tag or commit, so a rebuild may
+# pick up different flashinfer sources - consider pinning.
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# # ── Install vllm from git (skips compilation) - kept for reference ────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ──────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+# ── Build the local sglang sources into a wheel ─────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+# ── 🔄 Download sgl-kernel (version kept in sync with sglang) ────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
+
+# ── Collect all wheels into /wheels ─────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ Wheels for the dependencies the runtime stage needs ────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+# ── ✅ Wheels for the dependencies of the gradio UI ───────────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the wheels collected above
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+# OS packages. supervisor is installed in this single apt layer, so no second
+# `apt-get update` layer is needed and the package lists are removed right here.
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates supervisor \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    mkdir -p /etc/supervisor/conf.d && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 Copy the CUPTI shared libraries from the builder image
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic linker cache after adding the libraries
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+
+# Install torch first, then vllm / sgl_kernel / gradio, then every remaining wheel.
+# NOTE(review): this step is meant to prefer the self-built torch, but a source
+# build of tag v2.7.1 may be versioned "2.7.1a0+git<sha>", which is what the first
+# rm below matches - confirm which torch wheel is meant to survive.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
+    rm -rf /tmp/wheels
+
+# Pre-tuned MoE Triton kernel configs. Copied after the wheel install so that pip
+# cannot overwrite them with same-named files shipped inside the sglang wheel.
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Directory for multi-process metrics collection (used by MultiProcessCollector)
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ Make sure the directory exists
+RUN mkdir -p /tmp/prometheus
+
+# ✅ Add tini as the init process
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Copy the model (path can be changed) ----
+COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
+
+# Probe /health on port 30000 every 30s after a 300s start period
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed ports ----
+EXPOSE 30000 30001
+
+# supervisord configuration file and the UI script
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Run supervisord as the container's main process (started through tini)
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/Dockerfile.Qwen3-30B-A3B-Base
+++ b/Dockerfile.Qwen3-30B-A3B-Base
@ -0,0 +1,191 @@
+###############################################################################
+# Stage 0 - builder-torch: build a PyTorch 2.7.1 wheel from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Number of parallel build jobs; exported to the environment further down.
+ARG MAX_JOBS=90
+
+# apt / Python / locale settings plus the CUDA architectures to compile for.
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches handed to the PyTorch build through the environment.
+ENV BUILD_TEST=0 USE_CUDA=1 USE_DISTRIBUTED=1 \
+    USE_GLOO=1 USE_MPI=1 USE_NCCL=1 USE_SYSTEM_NCCL=1
+
+# Toolchain and headers; NCCL is pinned to the CUDA 12.6 packages.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates cmake git ninja-build \
+      libjpeg-dev libnccl-dev=2.22.3-1+cuda12.6 libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev libopenmpi-dev libpng-dev \
+      python3 python3-dev python3-distutils python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      numpy pip pyyaml setuptools sympy typing-extensions wheel
+
+WORKDIR /opt
+RUN git clone --recursive --branch v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install the self-built torch, build torchvision /
+# flashinfer / sglang against it, and collect all wheels under /wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ───────────────────────────────────────
+# The glob is handed to pip directly: if stage 0 produced no wheel, the pattern
+# stays unexpanded and pip fails right here, instead of `xargs -r` silently
+# skipping the install and the error surfacing later in the torchvision build.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
+# NOTE(review): the clone is not pinned to a tag or commit, so a rebuild may
+# pick up different flashinfer sources - consider pinning.
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# # ── Install vllm from git (skips compilation) - kept for reference ────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ──────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+# ── Build the local sglang sources into a wheel ─────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+# ── 🔄 Download sgl-kernel (version kept in sync with sglang) ────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
+
+# ── Collect all wheels into /wheels ─────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ Wheels for the dependencies the runtime stage needs ────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+# ── ✅ Wheels for the dependencies of the gradio UI ───────────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the wheels collected above
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+# OS packages. supervisor is installed in this single apt layer, so no second
+# `apt-get update` layer is needed and the package lists are removed right here.
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates supervisor \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    mkdir -p /etc/supervisor/conf.d && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 Copy the CUPTI shared libraries from the builder image
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic linker cache after adding the libraries
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+
+# Install torch first, then vllm / sgl_kernel / gradio, then every remaining wheel.
+# NOTE(review): this step is meant to prefer the self-built torch, but a source
+# build of tag v2.7.1 may be versioned "2.7.1a0+git<sha>", which is what the first
+# rm below matches - confirm which torch wheel is meant to survive.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
+    rm -rf /tmp/wheels
+
+# Pre-tuned MoE Triton kernel configs. Copied after the wheel install so that pip
+# cannot overwrite them with same-named files shipped inside the sglang wheel.
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Directory for multi-process metrics collection (used by MultiProcessCollector)
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ Make sure the directory exists
+RUN mkdir -p /tmp/prometheus
+
+# ✅ Add tini as the init process
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Copy the model (path can be changed) ----
+COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base
+
+# Probe /health on port 30000 every 30s after a 300s start period
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed ports ----
+EXPOSE 30000 30001
+
+# supervisord configuration file and the UI script
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Run supervisord as the container's main process (started through tini)
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/Dockerfile.bad
+++ b/Dockerfile.bad
@ -1,63 +0,0 @@
-############################################################
-#  Stage-0: 构建依赖轮子（PyTorch + SGLang + sgl_kernel）   #
-############################################################
-ARG CUDA_VERSION=12.8.1
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder
-
-# ---- Python 环境 ----
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
-    ln -sf /usr/bin/python3 /usr/bin/python && \
-    python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six
-
-# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
-ARG TORCH_VER=2.7.1
-ARG TV_VER=0.22.1
-RUN case "$CUDA_VERSION" in \
-        12.6.1) CUINDEX=126 ;; \
-        12.8.1) CUINDEX=128 ;; \
-        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
-    esac && \
-    python -m pip install --no-cache-dir \
-        torch==${TORCH_VER}+cu${CUINDEX} \
-        torchvision==${TV_VER}+cu${CUINDEX} \
-        --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
-    python -m pip install --no-cache-dir \
-        sglang==0.4.8.post1 \
-        sgl-kernel==0.0.2.post17 \
-        nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
-    # ✅ 补全依赖（必须）
-    python -m pip install --no-cache-dir \
-        pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
-    # ✅ 测试模块完整性
-    #python -c "import sglang, torch, pydantic, transformers, sgl_kernel"
-
-############################################################
-#  Stage-1: 生成最小运行镜像                                #
-############################################################
-FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    PYTHONUNBUFFERED=1
-
-# ---- Python runtime ----
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends python3 python3-distutils && \
-    ln -sf /usr/bin/python3 /usr/bin/python && \
-    rm -rf /var/lib/apt/lists/*
-
-# ---- 拷贝 Python 包和入口 ----
-COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
-COPY --from=builder /usr/local/bin /usr/local/bin
-
-# ---- 拷贝模型（路径可换） ----
-COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
-
-# ---- 启动服务 ----
-EXPOSE 30000
-CMD ["python3", "-m", "sglang.launch_server", \
-     "--host", "0.0.0.0", \
-     "--port", "30000", \
-     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
-     "--tp", "1", \
-     "--api-key", "token-abc123"]
--- a/Dockerfile.ds_llama_70b
+++ b/Dockerfile.ds_llama_70b
@ -0,0 +1,177 @@
+###############################################################################
+# Stage 0 - builder-torch: build a PyTorch 2.7.1 wheel from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Number of parallel build jobs; exported to the environment further down.
+ARG MAX_JOBS=90
+
+# apt / Python / locale settings plus the CUDA architectures to compile for.
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches handed to the PyTorch build through the environment.
+ENV BUILD_TEST=0 USE_CUDA=1 USE_DISTRIBUTED=1 \
+    USE_GLOO=1 USE_MPI=1 USE_NCCL=1 USE_SYSTEM_NCCL=1
+
+# Toolchain and headers; NCCL is pinned to the CUDA 12.6 packages.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates cmake git ninja-build \
+      libjpeg-dev libnccl-dev=2.22.3-1+cuda12.6 libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev libopenmpi-dev libpng-dev \
+      python3 python3-dev python3-distutils python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      numpy pip pyyaml setuptools sympy typing-extensions wheel
+
+WORKDIR /opt
+RUN git clone --recursive --branch v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install the self-built torch, build torchvision /
+# flashinfer / sglang against it, and collect all wheels under /wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ───────────────────────────────────────
+# The glob is handed to pip directly: if stage 0 produced no wheel, the pattern
+# stays unexpanded and pip fails right here, instead of `xargs -r` silently
+# skipping the install and the error surfacing later in the torchvision build.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
+# NOTE(review): the clone is not pinned to a tag or commit, so a rebuild may
+# pick up different flashinfer sources - consider pinning.
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+# # ── Install vllm from git (skips compilation) - kept for reference ────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ──────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+# ── Build the local sglang sources into a wheel ─────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+# ── sgl-kernel Python module, pinned to the version the sibling Dockerfiles use ─
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheel
+
+# ── Collect all wheels into /wheels ─────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ Wheels for the dependencies the runtime stage needs ────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the wheels collected above
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 Copy the CUPTI shared libraries from the builder image
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic linker cache after adding the libraries
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Install torch first, then vllm / sgl_kernel, then every remaining wheel; both
+# wheel staging directories are removed at the end of the step.
+# NOTE(review): a source build of tag v2.7.1 may be versioned "2.7.1a0+git<sha>",
+# which the first rm below matches - confirm which torch wheel should survive.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
+
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Directory for multi-process metrics collection (MultiProcessCollector), created right away
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+RUN mkdir -p /tmp/prometheus
+
+# ✅ Add tini as the init process
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Copy the model (path can be changed) ----
+COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B
+
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed port ----
+EXPOSE 30000
+
+# ---- Launch the SGLang server. NOTE(review): the API key below is baked into the image ----
+CMD ["python3", "-m", "sglang.launch_server", \
+     "--host", "0.0.0.0", \
+     "--port", "30000", \
+     "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
+     "--tp", "4", \
+     "--api-key", "token-abc123", \
+     "--enable-metrics"]
--- a/Dockerfile.llm_external
+++ b/Dockerfile.llm_external
@ -0,0 +1,191 @@
+###############################################################################
+# Stage 0 - builder-torch: compile PyTorch 2.7.1 from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Parallel job count for the build; override with --build-arg MAX_JOBS=<n>.
+ARG MAX_JOBS=90
+
+# Non-interactive apt, UTF-8 locale, and the CUDA architecture list.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches exported for the PyTorch build step at the end of this stage.
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+# Toolchain and BLAS / MPI / NCCL development packages (NCCL pinned), then the
+# Python packages installed ahead of the build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+# Fetch the v2.7.1 tag together with its submodules.
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+# Build the wheel (bdist_wheel leaves it under dist/).
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install torchvision / flashinfer / sglang on top of
+# the self-built torch and collect the wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Build toolchain and development libraries (NCCL pinned), then upgrade
+# pip / wheel / setuptools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-distutils \
+      python3-pip \
+      python3.10-dev && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# -- Install the self-built torch wheel ---------------------------------------
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+# The wheel glob goes straight to pip: if no torch wheel was produced, pip
+# rejects the unmatched pattern and the build stops here instead of silently
+# continuing without torch (which `find | xargs -r` would allow).
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# -- Build torchvision 0.22.1 against the locally installed torch -------------
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
+# --recursive also fetches the repository's git submodules, as the upstream
+# from-source instructions do.
+# NOTE(review): this tracks the default branch, so builds are not reproducible;
+# consider pinning a tag or commit.
+WORKDIR /opt
+RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+# Install into this stage, then produce the wheel under dist/.
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# -- (disabled) install vllm from git main without compiling ------------------
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
+WORKDIR /opt
+RUN pip download vllm==0.9.1 \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/vllm_wheels
+
+# -- Build the local sglang sources and produce a wheel -----------------------
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install --no-build-isolation ".[srt,openai]" && \
+    python3 -m pip wheel --no-deps -w /tmp/sg_wheels ".[srt,openai]"
+
+# -- Download sgl-kernel (version kept in sync with sglang) -------------------
+RUN pip download sgl-kernel==0.1.9 \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/sgl_kernel_wheels
+
+# -- Gather every wheel into /wheels ------------------------------------------
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl \
+       /opt/vision/dist/torchvision-*.whl \
+       /opt/flashinfer/dist/flashinfer_python-*.whl \
+       /tmp/vllm_wheels/vllm-*.whl \
+       /tmp/sg_wheels/sglang-*.whl \
+       /tmp/sgl_kernel_wheels/sgl_kernel-*.whl \
+       /wheels/ && \
+    pip wheel -w /wheels filelock typing-extensions sympy fsspec jinja2 networkx
+
+# -- Also wheel the dependencies the runtime stage needs ----------------------
+RUN pip wheel -w /wheels \
+      pydantic orjson psutil pyzmq pynvml \
+      transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+      setproctitle uvloop sentencepiece triton pillow \
+      cachetools msgspec blake3 cloudpickle compressed-tensors \
+      einops openai py-cpuinfo dill partial_json_parser \
+      python-multipart torchao
+
+# -- Wheel the packages required by the gradio UI -----------------------------
+RUN pip wheel -w /wheels "gradio==5.38.2" requests
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the prebuilt wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# System packages (compilers, CUDA compiler, Python, math / MPI / image
+# libraries); apt lists are dropped in the same layer. pip is then upgraded and
+# xgrammar is installed from the package index.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cuda-compiler-12-6 \
+      curl \
+      g++ \
+      gcc \
+      libcupti-dev \
+      libgomp1 \
+      libjpeg8 \
+      libnuma-dev \
+      libnuma1 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng16-16 \
+      ninja-build \
+      openmpi-bin \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# Copy the CUPTI shared libraries over from the builder stage.
+COPY --from=builder-extras \
+     /usr/local/cuda/lib64/libcupti.so.12 \
+     /usr/local/cuda/lib64/libcupti.so \
+     /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic-linker cache after adding the libraries.
+RUN ldconfig
+
+# Wheels gathered by the builder-extras stage.
+COPY --from=builder-extras /wheels /tmp/wheels
+
+# Install torch* first, then vllm, sgl-kernel and gradio, and finally every
+# remaining non-gradio wheel in a single pass, always with --no-deps so pip
+# does not resolve dependencies. The python3 one-liner checks that gradio
+# imports, then the staging directory is removed.
+# NOTE(review): the first `rm -f` deletes any wheel named torch-2.7.1a0+*;
+# confirm that is not the self-built wheel this step is meant to prefer.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
+    rm -rf /tmp/wheels
+
+# ---- Pre-tuned MoE Triton kernel configs -------------------------------
+# Copied after the wheel install so that installing the sglang wheel cannot
+# overwrite same-named files in this directory.
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+
+
+# Metrics: install the Prometheus client and create the directory used for
+# multi-process metrics collection (MultiProcessCollector).
+RUN python3 -m pip install --no-cache-dir prometheus_client && \
+    mkdir -p /tmp/prometheus
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# Init process: download the tini release binary and use it as the entrypoint.
+# NOTE(review): the download is not checksum-verified.
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Model weights (the path can be changed); currently not copied in ----
+# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
+
+# ---- Health probe against the server's /health endpoint ----
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
+    CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed ports ----
+EXPOSE 30000 30001
+
+# Install supervisor without recommended extras and drop the apt lists in the
+# same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends supervisor && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /etc/supervisor/conf.d
+
+# Copy the supervisord configuration and the UI script.
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Default command: start supervisord with the configuration copied above.
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/Dockerfile.qwen3-14b
+++ b/Dockerfile.qwen3-14b
@ -0,0 +1,177 @@
+###############################################################################
+# Stage 0 - builder-torch: compile PyTorch 2.7.1 from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Parallel job count for the build; override with --build-arg MAX_JOBS=<n>.
+ARG MAX_JOBS=90
+
+# Non-interactive apt, UTF-8 locale, and the CUDA architecture list.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches exported for the PyTorch build step at the end of this stage.
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+# Toolchain and BLAS / MPI / NCCL development packages (NCCL pinned), then the
+# Python packages installed ahead of the build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+# Fetch the v2.7.1 tag together with its submodules.
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+# Build the wheel (bdist_wheel leaves it under dist/).
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install torchvision / flashinfer / sglang on top of
+# the self-built torch and collect the wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Build toolchain and development libraries (NCCL pinned), then upgrade
+# pip / wheel / setuptools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-distutils \
+      python3-pip \
+      python3.10-dev && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# -- Install the self-built torch wheel ---------------------------------------
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+# The wheel glob goes straight to pip: if no torch wheel was produced, pip
+# rejects the unmatched pattern and the build stops here instead of silently
+# continuing without torch (which `find | xargs -r` would allow).
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# -- Build torchvision 0.22.1 against the locally installed torch -------------
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
+# --recursive also fetches the repository's git submodules, as the upstream
+# from-source instructions do.
+# NOTE(review): this tracks the default branch, so builds are not reproducible;
+# consider pinning a tag or commit.
+WORKDIR /opt
+RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+# Install into this stage, then produce the wheel under dist/.
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# -- (disabled) install vllm from git main without compiling ------------------
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
+WORKDIR /opt
+RUN pip download vllm==0.9.1 \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/vllm_wheels
+
+# -- Build the local sglang sources and produce a wheel -----------------------
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install --no-build-isolation ".[srt,openai]" && \
+    python3 -m pip wheel --no-deps -w /tmp/sg_wheels ".[srt,openai]"
+
+# -- Download the sgl-kernel Python module as a binary wheel ------------------
+# NOTE(review): the version is not pinned here, while Dockerfile.llm_external
+# pins sgl-kernel==0.1.9; confirm whether this file should match.
+RUN pip download sgl-kernel \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/sgl_kernel_wheel
+
+# -- Gather every wheel into /wheels ------------------------------------------
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl \
+       /opt/vision/dist/torchvision-*.whl \
+       /opt/flashinfer/dist/flashinfer_python-*.whl \
+       /tmp/vllm_wheels/vllm-*.whl \
+       /tmp/sg_wheels/sglang-*.whl \
+       /wheels/ && \
+    pip wheel -w /wheels filelock typing-extensions sympy fsspec jinja2 networkx
+
+# -- Also wheel the dependencies the runtime stage needs ----------------------
+RUN pip wheel -w /wheels \
+      pydantic orjson psutil pyzmq pynvml \
+      transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+      setproctitle uvloop sentencepiece triton pillow \
+      cachetools msgspec blake3 cloudpickle compressed-tensors \
+      einops openai py-cpuinfo dill partial_json_parser \
+      python-multipart torchao
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the prebuilt wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# System packages (compilers, CUDA compiler, Python, math / MPI / image
+# libraries); apt lists are dropped in the same layer. pip is then upgraded and
+# xgrammar is installed from the package index.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cuda-compiler-12-6 \
+      curl \
+      g++ \
+      gcc \
+      libcupti-dev \
+      libgomp1 \
+      libjpeg8 \
+      libnuma-dev \
+      libnuma1 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng16-16 \
+      ninja-build \
+      openmpi-bin \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# Copy the CUPTI shared libraries over from the builder stage.
+COPY --from=builder-extras \
+     /usr/local/cuda/lib64/libcupti.so.12 \
+     /usr/local/cuda/lib64/libcupti.so \
+     /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic-linker cache after adding the libraries.
+RUN ldconfig
+
+# Wheels gathered by the builder-extras stage, plus the sgl-kernel wheel that
+# stage downloaded into its own directory.
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Earlier single-command variant, kept for reference:
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# Install torch* first, then vllm, sgl-kernel and finally everything in
+# /tmp/wheels, always with --no-deps so pip does not resolve dependencies.
+# The python3 one-liner then checks that torch.distributed imports.
+# NOTE(review): the first `rm -f` deletes any wheel named torch-2.7.1a0+*;
+# confirm that is not the self-built wheel this step is meant to prefer.
+# Both staging directories are removed at the end (the COPY layers above still
+# carry their bytes, so this only tidies the final filesystem).
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
+
+# Metrics: install the Prometheus client and create the directory used for
+# multi-process metrics collection (MultiProcessCollector).
+RUN python3 -m pip install --no-cache-dir prometheus_client && \
+    mkdir -p /tmp/prometheus
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# Init process: download the tini release binary and use it as the entrypoint.
+# NOTE(review): the download is not checksum-verified.
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Model weights copied into the image (the path can be changed) ----
+COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B
+
+# ---- Health probe against the server's /health endpoint ----
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
+    CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed port ----
+EXPOSE 30000
+
+# ---- Start the SGLang inference server ----
+# NOTE(review): the API key is hard-coded here; consider injecting it at run time.
+CMD ["python3", "-m", "sglang.launch_server", \
+     "--host", "0.0.0.0", "--port", "30000", \
+     "--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
+     "--tp", "2", \
+     "--api-key", "token-abc123", \
+     "--enable-metrics"]
--- a/Dockerfile.qwen3-14b-base
+++ b/Dockerfile.qwen3-14b-base
@ -0,0 +1,183 @@
+###############################################################################
+# Stage 0 - builder-torch: compile PyTorch 2.7.1 from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Parallel job count for the build; override with --build-arg MAX_JOBS=<n>.
+ARG MAX_JOBS=90
+
+# Non-interactive apt, UTF-8 locale, and the CUDA architecture list.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches exported for the PyTorch build step at the end of this stage.
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+# Toolchain and BLAS / MPI / NCCL development packages (NCCL pinned), then the
+# Python packages installed ahead of the build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+# Fetch the v2.7.1 tag together with its submodules.
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+# Build the wheel (bdist_wheel leaves it under dist/).
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install torchvision / flashinfer / sglang on top of
+# the self-built torch and collect the wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Build toolchain and development libraries (NCCL pinned), then upgrade
+# pip / wheel / setuptools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-distutils \
+      python3-pip \
+      python3.10-dev && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# -- Install the self-built torch wheel ---------------------------------------
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+# The wheel glob goes straight to pip: if no torch wheel was produced, pip
+# rejects the unmatched pattern and the build stops here instead of silently
+# continuing without torch (which `find | xargs -r` would allow).
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# -- Build torchvision 0.22.1 against the locally installed torch -------------
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
+# --recursive also fetches the repository's git submodules, as the upstream
+# from-source instructions do.
+# NOTE(review): this tracks the default branch, so builds are not reproducible;
+# consider pinning a tag or commit.
+WORKDIR /opt
+RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+# Install into this stage, then produce the wheel under dist/.
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# -- (disabled) install vllm from git main without compiling ------------------
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
+WORKDIR /opt
+RUN pip download vllm==0.9.1 \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/vllm_wheels
+
+# -- Build the local sglang sources and produce a wheel -----------------------
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install --no-build-isolation ".[srt,openai]" && \
+    python3 -m pip wheel --no-deps -w /tmp/sg_wheels ".[srt,openai]"
+
+# -- Download the sgl-kernel Python module as a binary wheel ------------------
+# NOTE(review): the version is not pinned here, while Dockerfile.llm_external
+# pins sgl-kernel==0.1.9; confirm whether this file should match.
+RUN pip download sgl-kernel \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/sgl_kernel_wheel
+
+# -- Gather every wheel into /wheels ------------------------------------------
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl \
+       /opt/vision/dist/torchvision-*.whl \
+       /opt/flashinfer/dist/flashinfer_python-*.whl \
+       /tmp/vllm_wheels/vllm-*.whl \
+       /tmp/sg_wheels/sglang-*.whl \
+       /wheels/ && \
+    pip wheel -w /wheels filelock typing-extensions sympy fsspec jinja2 networkx
+
+# -- Also wheel the dependencies the runtime stage needs ----------------------
+RUN pip wheel -w /wheels \
+      pydantic orjson psutil pyzmq pynvml \
+      transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+      setproctitle uvloop sentencepiece triton pillow \
+      cachetools msgspec blake3 cloudpickle compressed-tensors \
+      einops openai py-cpuinfo dill partial_json_parser \
+      python-multipart torchao
+
+# -- Wheel the packages required by the gradio UI -----------------------------
+# NOTE(review): gradio is not pinned here, while Dockerfile.llm_external pins
+# gradio==5.38.2; confirm whether this file should match.
+RUN pip wheel -w /wheels gradio requests
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the prebuilt wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# System packages (compilers, CUDA compiler, Python, math / MPI / image
+# libraries); apt lists are dropped in the same layer. pip is then upgraded and
+# xgrammar is installed from the package index.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cuda-compiler-12-6 \
+      curl \
+      g++ \
+      gcc \
+      libcupti-dev \
+      libgomp1 \
+      libjpeg8 \
+      libnuma-dev \
+      libnuma1 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng16-16 \
+      ninja-build \
+      openmpi-bin \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# Copy the CUPTI shared libraries over from the builder stage.
+COPY --from=builder-extras \
+     /usr/local/cuda/lib64/libcupti.so.12 \
+     /usr/local/cuda/lib64/libcupti.so \
+     /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic-linker cache after adding the libraries.
+RUN ldconfig
+
+# Wheels gathered by the builder-extras stage, plus the sgl-kernel wheel that
+# stage downloaded into its own directory.
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Earlier single-command variant, kept for reference:
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# Install torch* first, then vllm, sgl-kernel and finally everything in
+# /tmp/wheels, always with --no-deps so pip does not resolve dependencies.
+# The python3 one-liner then checks that torch.distributed imports.
+# NOTE(review): the first `rm -f` deletes any wheel named torch-2.7.1a0+*;
+# confirm that is not the self-built wheel this step is meant to prefer.
+# Both staging directories are removed at the end (the COPY layers above still
+# carry their bytes, so this only tidies the final filesystem).
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
+
+# Metrics: install the Prometheus client and create the directory used for
+# multi-process metrics collection (MultiProcessCollector).
+RUN python3 -m pip install --no-cache-dir prometheus_client && \
+    mkdir -p /tmp/prometheus
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# Init process: download the tini release binary and use it as the entrypoint.
+# NOTE(review): the download is not checksum-verified.
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Model weights copied into the image (the path can be changed) ----
+COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base
+
+# ---- Health probe against the server's /health endpoint ----
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
+    CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed ports ----
+EXPOSE 30000 30001
+
+# Install supervisor without recommended extras and drop the apt lists in the
+# same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends supervisor && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /etc/supervisor/conf.d
+
+# Copy the supervisord configuration and the UI script.
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Default command: start supervisord with the configuration copied above.
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/Dockerfile.qwen3-32b
+++ b/Dockerfile.qwen3-32b
@ -0,0 +1,177 @@
+###############################################################################
+# Stage 0 - builder-torch: compile PyTorch 2.7.1 from source (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+# Parallel job count for the build; override with --build-arg MAX_JOBS=<n>.
+ARG MAX_JOBS=90
+
+# Non-interactive apt, UTF-8 locale, and the CUDA architecture list.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+# Build switches exported for the PyTorch build step at the end of this stage.
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+# Toolchain and BLAS / MPI / NCCL development packages (NCCL pinned), then the
+# Python packages installed ahead of the build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    python3 -m pip install --no-cache-dir --upgrade \
+      pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+# Fetch the v2.7.1 tag together with its submodules.
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+# Build the wheel (bdist_wheel leaves it under dist/).
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 - builder-extras: install torchvision / flashinfer / sglang on top of
+# the self-built torch and collect the wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Build toolchain and development libraries (NCCL pinned), then upgrade
+# pip / wheel / setuptools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cmake \
+      git \
+      libjpeg-dev \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng-dev \
+      ninja-build \
+      python3 \
+      python3-distutils \
+      python3-pip \
+      python3.10-dev && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# -- Install the self-built torch wheel ---------------------------------------
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+# The wheel glob goes straight to pip: if no torch wheel was produced, pip
+# rejects the unmatched pattern and the build stops here instead of silently
+# continuing without torch (which `find | xargs -r` would allow).
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# -- Build torchvision 0.22.1 against the locally installed torch -------------
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
+# --recursive also fetches the repository's git submodules, as the upstream
+# from-source instructions do.
+# NOTE(review): this tracks the default branch, so builds are not reproducible;
+# consider pinning a tag or commit.
+WORKDIR /opt
+RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+# Install into this stage, then produce the wheel under dist/.
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# -- (disabled) install vllm from git main without compiling ------------------
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
+WORKDIR /opt
+RUN pip download vllm==0.9.1 \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/vllm_wheels
+
+# -- Build the local sglang sources and produce a wheel -----------------------
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install --no-build-isolation ".[srt,openai]" && \
+    python3 -m pip wheel --no-deps -w /tmp/sg_wheels ".[srt,openai]"
+
+# -- Download the sgl-kernel Python module as a binary wheel ------------------
+# NOTE(review): the version is not pinned here, while Dockerfile.llm_external
+# pins sgl-kernel==0.1.9; confirm whether this file should match.
+RUN pip download sgl-kernel \
+      --no-deps \
+      --only-binary=:all: \
+      -d /tmp/sgl_kernel_wheel
+
+# -- Gather every wheel into /wheels ------------------------------------------
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl \
+       /opt/vision/dist/torchvision-*.whl \
+       /opt/flashinfer/dist/flashinfer_python-*.whl \
+       /tmp/vllm_wheels/vllm-*.whl \
+       /tmp/sg_wheels/sglang-*.whl \
+       /wheels/ && \
+    pip wheel -w /wheels filelock typing-extensions sympy fsspec jinja2 networkx
+
+# -- Also wheel the dependencies the runtime stage needs ----------------------
+RUN pip wheel -w /wheels \
+      pydantic orjson psutil pyzmq pynvml \
+      transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+      setproctitle uvloop sentencepiece triton pillow \
+      cachetools msgspec blake3 cloudpickle compressed-tensors \
+      einops openai py-cpuinfo dill partial_json_parser \
+      python-multipart torchao
+
+###############################################################################
+# Stage 2 - runtime: slim runtime image that installs the prebuilt wheels
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# System packages (compilers, CUDA compiler, Python, math / MPI / image
+# libraries); apt lists are dropped in the same layer. pip is then upgraded and
+# xgrammar is installed from the package index.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      cuda-compiler-12-6 \
+      curl \
+      g++ \
+      gcc \
+      libcupti-dev \
+      libgomp1 \
+      libjpeg8 \
+      libnuma-dev \
+      libnuma1 \
+      libopenblas-dev \
+      libopenmpi-dev \
+      libpng16-16 \
+      ninja-build \
+      openmpi-bin \
+      python3 \
+      python3-dev \
+      python3-distutils \
+      python3-pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir xgrammar
+
+# Copy the CUPTI shared libraries over from the builder stage.
+COPY --from=builder-extras \
+     /usr/local/cuda/lib64/libcupti.so.12 \
+     /usr/local/cuda/lib64/libcupti.so \
+     /usr/lib/x86_64-linux-gnu/
+
+# Refresh the dynamic-linker cache after adding the libraries.
+RUN ldconfig
+
+# Wheels gathered by the builder-extras stage, plus the sgl-kernel wheel that
+# stage downloaded into its own directory.
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Earlier single-command variant, kept for reference:
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# Install torch* first, then vllm, sgl-kernel and finally everything in
+# /tmp/wheels, always with --no-deps so pip does not resolve dependencies.
+# The python3 one-liner then checks that torch.distributed imports.
+# NOTE(review): the first `rm -f` deletes any wheel named torch-2.7.1a0+*;
+# confirm that is not the self-built wheel this step is meant to prefer.
+# Both staging directories are removed at the end (the COPY layers above still
+# carry their bytes, so this only tidies the final filesystem).
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
+
+# Metrics: install the Prometheus client and create the directory used for
+# multi-process metrics collection (MultiProcessCollector).
+RUN python3 -m pip install --no-cache-dir prometheus_client && \
+    mkdir -p /tmp/prometheus
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# Init process: download the tini release binary and use it as the entrypoint.
+# NOTE(review): the download is not checksum-verified.
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- Model weights copied into the image (the path can be changed) ----
+COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B
+
+# ---- Health probe against the server's /health endpoint ----
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 \
+    CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- Exposed port ----
+EXPOSE 30000
+
+# ---- Start the SGLang inference server ----
+# NOTE(review): the API key is hard-coded here; consider injecting it at run time.
+CMD ["python3", "-m", "sglang.launch_server", \
+     "--host", "0.0.0.0", "--port", "30000", \
+     "--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
+     "--tp", "4", \
+     "--api-key", "token-abc123", \
+     "--enable-metrics"]
--- a/Dockerfile.qwen3-8b
+++ b/Dockerfile.qwen3-8b
@ -0,0 +1,177 @@
+###############################################################################
+# Stage 0 ─ builder-torch：编译 PyTorch 2.7.1 (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+ARG MAX_JOBS=90                       
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
+      libopenblas-dev libopenmpi-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libjpeg-dev libpng-dev ca-certificates && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 ─ builder-extras：用自编 Torch 装 TV / flashinfer / sglang，并收集轮子
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev\
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ──────────────────────────────────────
+# The wheel is produced by the builder-torch stage and copied into this stage.
+# It is installed through a plain shell glob instead of `find | xargs -r`:
+# with `xargs -r` a missing wheel was silently skipped (and the pipe hid any
+# `find` failure), so the build only broke much later. With an unmatched glob
+# pip receives the literal pattern and this step fails immediately.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# ── 编译 torchvision 0.22.1 (依赖本地 torch) ────────────────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── 编译 flashinfer (主分支支持 torch 2.7 / cu126) ─────────────────────────
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# # ── 安装 vllm（跳过编译，直接装） ─────────────────────────────────────────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── 下载 vllm 预编译 wheel，避免编译 flash-attn ───────────────────────────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+
+# ── 编译你本地 sglang 源码并打 wheel ───────────────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+# ── sgl-kernel 的 Python 模块 ───────────────────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel  
+
+# ── 收集所有 wheel 到 /wheels ──────────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ 再打包 runtime 阶段必需依赖 ────────────────────────────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+###############################################################################
+# Stage 2 ─ runtime：极简运行镜像，仅离线安装 wheel
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6\
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip \
+    && python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 拷贝 cupti 动态库（避免写死版本号）
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# 👇建议在后面补上
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Previous single-step install, kept for reference:
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# Offline install of the wheels collected in the builder-extras stage. Every
+# pip call uses --no-deps, so only the listed files are installed and no
+# dependencies are pulled in. Order: torch*.whl first (original note: install
+# the self-built torch first so the PyPI build does not override it), then
+# vllm, then sgl-kernel, then everything in /tmp/wheels. The final python3 -c
+# line is a smoke test: the build fails if torch.distributed cannot be imported.
+# Note that the torch*.whl glob also matches torchvision-/torchao- wheels.
+# NOTE(review): `rm -f torch-2.7.1a0+*.whl` presumably drops an unwanted torch
+# wheel that ended up in /wheels next to the self-built one; confirm.
+# NOTE(review): only /tmp/wheels is removed; /tmp/sgl_kernel_wheel is left behind.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels
+
+# ✅ 安装 Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ 设置多进程 metrics 收集目录（用于 MultiProcessCollector）
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ 确保目录存在
+RUN mkdir -p /tmp/prometheus
+
+# ✅ 添加 Tini（推荐）
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- 拷贝模型（路径可换） ----
+COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
+
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- 暴露端口 ----
+EXPOSE 30000
+
+# ---- 启动 SGLang 推理服务 ----
+CMD ["python3", "-m", "sglang.launch_server", \
+     "--host", "0.0.0.0", \
+     "--port", "30000", \
+     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
+     "--tp", "1", \
+     "--api-key", "token-abc123", \
+     "--enable-metrics"]
--- a/Dockerfile.qwq32b
+++ b/Dockerfile.qwq32b
@ -0,0 +1,177 @@
+###############################################################################
+# Stage 0 ─ builder-torch：编译 PyTorch 2.7.1 (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+ARG MAX_JOBS=90                       
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
+      libopenblas-dev libopenmpi-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libjpeg-dev libpng-dev ca-certificates && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 ─ builder-extras：用自编 Torch 装 TV / flashinfer / sglang，并收集轮子
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev\
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ──────────────────────────────────────
+# The wheel is produced by the builder-torch stage and copied into this stage.
+# It is installed through a plain shell glob instead of `find | xargs -r`:
+# with `xargs -r` a missing wheel was silently skipped (and the pipe hid any
+# `find` failure), so the build only broke much later. With an unmatched glob
+# pip receives the literal pattern and this step fails immediately.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# ── 编译 torchvision 0.22.1 (依赖本地 torch) ────────────────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── 编译 flashinfer (主分支支持 torch 2.7 / cu126) ─────────────────────────
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# # ── 安装 vllm（跳过编译，直接装） ─────────────────────────────────────────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── 下载 vllm 预编译 wheel，避免编译 flash-attn ───────────────────────────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+
+# ── 编译你本地 sglang 源码并打 wheel ───────────────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+# ── sgl-kernel 的 Python 模块 ───────────────────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel  
+
+# ── 收集所有 wheel 到 /wheels ──────────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ 再打包 runtime 阶段必需依赖 ────────────────────────────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+###############################################################################
+# Stage 2 ─ runtime：极简运行镜像，仅离线安装 wheel
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6\
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip \
+    && python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 拷贝 cupti 动态库（避免写死版本号）
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# 👇建议在后面补上
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+
+# Previous single-step install, kept for reference:
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# Offline install of the wheels collected in the builder-extras stage. Every
+# pip call uses --no-deps, so only the listed files are installed and no
+# dependencies are pulled in. Order: torch*.whl first (original note: install
+# the self-built torch first so the PyPI build does not override it), then
+# vllm, then sgl-kernel, then everything in /tmp/wheels. The final python3 -c
+# line is a smoke test: the build fails if torch.distributed cannot be imported.
+# Note that the torch*.whl glob also matches torchvision-/torchao- wheels.
+# NOTE(review): `rm -f torch-2.7.1a0+*.whl` presumably drops an unwanted torch
+# wheel that ended up in /wheels next to the self-built one; confirm.
+# NOTE(review): only /tmp/wheels is removed; /tmp/sgl_kernel_wheel is left behind.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels
+
+# ✅ 安装 Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ 设置多进程 metrics 收集目录（用于 MultiProcessCollector）
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ 确保目录存在
+RUN mkdir -p /tmp/prometheus
+
+# ✅ 添加 Tini（推荐）
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- 拷贝模型（路径可换） ----
+COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B
+
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- 暴露端口 ----
+EXPOSE 30000
+
+# ---- 启动 SGLang 推理服务 ----
+CMD ["python3", "-m", "sglang.launch_server", \
+     "--host", "0.0.0.0", \
+     "--port", "30000", \
+     "--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
+     "--tp", "4", \
+     "--api-key", "token-abc123", \
+     "--enable-metrics"]
--- a/Dockerfile.tmp
+++ b/Dockerfile.tmp
@ -0,0 +1,191 @@
+###############################################################################
+# Stage 0 ─ builder-torch：编译 PyTorch 2.7.1 (+cu126)
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
+
+ENV USE_CUDA=1 \
+    USE_DISTRIBUTED=1 \
+    USE_MPI=1 \
+    USE_GLOO=1 \
+    USE_NCCL=1 \
+    USE_SYSTEM_NCCL=1 \
+    BUILD_TEST=0
+
+ARG MAX_JOBS=90                       
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
+      libopenblas-dev libopenmpi-dev \
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 \
+      libjpeg-dev libpng-dev ca-certificates && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
+
+WORKDIR /opt
+RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
+
+WORKDIR /opt/pytorch
+ENV MAX_JOBS=${MAX_JOBS}
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel
+
+###############################################################################
+# Stage 1 ─ builder-extras：用自编 Torch 装 TV / flashinfer / sglang，并收集轮子
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      python3 python3-pip python3-distutils python3.10-dev git build-essential \
+      cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
+      libopenmpi-dev libopenblas-dev\
+      libnccl2=2.22.3-1+cuda12.6 \
+      libnccl-dev=2.22.3-1+cuda12.6 && \
+    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
+
+# ── Install the self-built torch wheel ──────────────────────────────────────
+# The wheel is produced by the builder-torch stage and copied into this stage.
+# It is installed through a plain shell glob instead of `find | xargs -r`:
+# with `xargs -r` a missing wheel was silently skipped (and the pipe hid any
+# `find` failure), so the build only broke much later. With an unmatched glob
+# pip receives the literal pattern and this step fails immediately.
+COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
+RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
+    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
+
+
+
+# ── 编译 torchvision 0.22.1 (依赖本地 torch) ────────────────────────────────
+WORKDIR /opt
+RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
+WORKDIR /opt/vision
+RUN python3 setup.py bdist_wheel
+
+# ── 编译 flashinfer (主分支支持 torch 2.7 / cu126) ─────────────────────────
+WORKDIR /opt
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git
+WORKDIR /opt/flashinfer
+
+RUN pip install . && \
+    python3 -m pip wheel . --no-deps -w dist/
+
+
+# # ── 安装 vllm（跳过编译，直接装） ─────────────────────────────────────────────
+# WORKDIR /opt
+# RUN pip install setuptools wheel setuptools_scm && \
+#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
+#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
+
+# ── 下载 vllm 预编译 wheel，避免编译 flash-attn ───────────────────────────────
+WORKDIR /opt
+RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
+
+
+# ── 编译你本地 sglang 源码并打 wheel ───────────────────────────────────────
+COPY ./sglang /sgl/sglang
+WORKDIR /sgl/sglang/python
+RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
+    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
+
+
+# ── 🔄 下载 sgl-kernel（与 sglang 同步）───────────────────────────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
+
+# ── 收集所有 wheel 到 /wheels ──────────────────────────────────────────────
+RUN mkdir -p /wheels && \
+    cp /tmp/torch_dist/torch*.whl /wheels/ && \
+    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
+    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
+    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
+    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl   /wheels/ && \
+    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
+
+# ── ✅ 再打包 runtime 阶段必需依赖 ────────────────────────────────────────────
+RUN pip wheel \
+    pydantic orjson psutil pyzmq pynvml \
+    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
+    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
+    -w /wheels
+
+# ── ✅ 打包 gradio UI 所需依赖 ────────────────────────────────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
+###############################################################################
+# Stage 2 ─ runtime：极简运行镜像，仅离线安装 wheel
+###############################################################################
+ARG CUDA_VERSION=12.6.1
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6\
+        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
+        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
+    rm -rf /var/lib/apt/lists/* && \
+    python3 -m pip install --no-cache-dir --upgrade pip \
+    && python3 -m pip install --no-cache-dir xgrammar
+
+# 👉 拷贝 cupti 动态库（避免写死版本号）
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
+COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
+
+# 👇建议在后面补上
+RUN ldconfig
+
+COPY --from=builder-extras /wheels /tmp/wheels
+
+# Offline install of the wheels collected in the builder-extras stage. Every
+# pip call uses --no-deps, so only the listed files are installed. Order: the
+# self-built torch first (so a PyPI build cannot override it), then vllm,
+# sgl-kernel and the pinned gradio, then every remaining wheel except gradio-*.
+# The earlier `ls | grep | sed` variant of that last step was dropped: it
+# installed the same set of files as the `find` form below and parsed `ls` output.
+# The python3 -c line is a smoke test that fails the build if gradio is missing.
+RUN ls -lh /tmp/wheels && \
+    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
+    rm -rf /tmp/wheels
+
+# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
+# This must come AFTER the sglang wheel is installed: copied earlier, any file
+# with the same name shipped inside the wheel would overwrite the tuned one.
+# Placing it here also keeps a config change from invalidating the install layer.
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+
+
+
+
+# ✅ 安装 Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ 设置多进程 metrics 收集目录（用于 MultiProcessCollector）
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ 确保目录存在
+RUN mkdir -p /tmp/prometheus
+
+# ✅ 添加 Tini（推荐）
+ENV TINI_VERSION=v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+# ---- 拷贝模型（路径可换） ----
+COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
+
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
+
+# ---- 暴露端口 ----
+EXPOSE 30000 30001
+
+# Install supervisor (started as the container's main command by the CMD below).
+# --no-install-recommends keeps optional packages out of the runtime image, and
+# the apt lists are removed in the same layer so they do not persist in it.
+RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /etc/supervisor/conf.d
+
+# 拷贝 supervisord 配置文件和 UI 脚本
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# 作为容器主进程运行 supervisor
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/meta_ui.py
+++ b/meta_ui.py
@ -0,0 +1,224 @@
+import json, datetime, textwrap, requests, gradio as gr
+from pathlib import Path
+from collections import deque
+import queue, threading, time
+
+# ────────────────── 基础配置 ──────────────────
+API_KEY    = "token-abc123"
+MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
+
+
+def model_name(path: Path):
+    """Return a display name for the model stored under ``path``.
+
+    When ``path/config.json`` exists, returns (in order of preference) the
+    first entry of its ``architectures`` list, its ``model_type`` value, or
+    the directory name. Without a config file, returns the directory name.
+    """
+    cfg = path / "config.json"
+    if not cfg.exists():
+        return path.name
+    # Context manager closes the handle instead of leaving it to the GC.
+    with cfg.open() as fh:
+        data = json.load(fh)
+    # `architectures` may be absent, null or an empty list; none of those
+    # should raise, they just fall through to the next candidate.
+    archs = data.get("architectures") or [None]
+    return archs[0] or data.get("model_type") or path.name
+
+MODEL_NAME = model_name(MODEL_PATH)
+now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
+
+# ────────────────── 日志队列 ──────────────────
+LOG_Q: "queue.Queue[str]" = queue.Queue()
+LOG_TXT = ""
+
+
+def log(msg):
+    """Print ``msg`` to stdout (flushed immediately) and enqueue it on ``LOG_Q``."""
+    print(msg, flush=True)
+    LOG_Q.put(msg)
+
+
+prev_log_value = ""
+
+def consume_logs(dummy=None):
+    """Drain ``LOG_Q`` into the global ``LOG_TXT`` buffer and report changes.
+
+    The buffer is rebuilt from the current ``LOG_TXT`` lines plus every queued
+    message, capped at 400 entries. Returns ``gr.update(value=LOG_TXT)`` when
+    the text differs from the last value handed out, otherwise an empty
+    ``gr.update()``. ``dummy`` is accepted and ignored.
+    """
+    global LOG_TXT, prev_log_value
+    recent = deque(LOG_TXT.splitlines(), maxlen=400)
+    while True:
+        if LOG_Q.empty():
+            break
+        recent.append(LOG_Q.get())
+    LOG_TXT = "\n".join(recent)
+    if LOG_TXT == prev_log_value:
+        # Nothing new since the previous call: send a no-op update.
+        return gr.update()
+    prev_log_value = LOG_TXT
+    return gr.update(value=LOG_TXT)
+
+
+# ────────────────── 后端调用 ──────────────────
+def backend(text, sampling, api_suffix):
+    """POST one inference request to the local server and return the reply text.
+
+    Args:
+        text: Prompt string for ``/generate`` and ``/v1/completions``; the
+            messages list for ``/v1/chat/completions``.
+        sampling: Sampling parameters. Nested under ``sampling_params`` for
+            ``/generate``, merged into the top level of the payload otherwise.
+        api_suffix: ``/generate``, ``/v1/completions`` or
+            ``/v1/chat/completions``.
+
+    Returns:
+        The generated text, ``"[⚠ 空]"`` when it is empty,
+        ``"[HTTP <code>] <body>"`` for a non-200 response, or a string
+        starting with ``"[❌ 请求异常]"`` when the request fails or the
+        suffix is not supported.
+    """
+    url = f"http://localhost:30000{api_suffix}"
+    if api_suffix == "/generate":
+        payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
+    elif api_suffix == "/v1/completions":
+        payload = {"model": MODEL_NAME, "prompt": text, **sampling}
+    elif api_suffix == "/v1/chat/completions":
+        # Here `text` is really the messages list.
+        payload = {"model": MODEL_NAME, "messages": text, **sampling}
+    else:
+        # Without this branch `payload` stayed unbound and the log call below
+        # raised NameError outside the try block.
+        err = f"[❌ 请求异常] unsupported api_suffix: {api_suffix!r}"
+        log(err)
+        return err
+
+    log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
+    try:
+        r = requests.post(url,
+                          headers={"Authorization": f"Bearer {API_KEY}",
+                                   "Content-Type": "application/json"},
+                          json=payload, timeout=180)
+        try:
+            data = r.json()
+        except Exception:
+            data = {}
+        if not isinstance(data, dict):
+            # A non-object JSON body carries none of the fields read below.
+            data = {}
+
+        # `or` fallbacks (rather than .get defaults) also cover keys that are
+        # present but null / empty, e.g. a null `content` or an empty
+        # `choices` list, which previously surfaced as a request exception.
+        if api_suffix == "/generate":
+            txt = (data.get("text") or "").strip()
+            meta = data.get("meta_info") or {}
+            fr = meta.get("finish_reason")
+            ctok = meta.get("completion_tokens")
+        else:
+            choice = (data.get("choices") or [{}])[0]
+            if api_suffix == "/v1/completions":
+                txt = (choice.get("text") or "").strip()
+            else:
+                txt = ((choice.get("message") or {}).get("content") or "").strip()
+            fr = choice.get("finish_reason")
+            ctok = (data.get("usage") or {}).get("completion_tokens")
+
+        log(f"🟢 [{now()}] HTTP {r.status_code}  tokens={ctok}  finish={fr}\n"
+            f"🟢 resp={r.text!r}")
+        if r.status_code != 200:
+            return f"[HTTP {r.status_code}] {r.text}"
+        return txt or "[⚠ 空]"
+    except Exception as e:
+        log(f"[❌ 请求异常] {e}")
+        return f"[❌ 请求异常] {e}"
+
+
+# ────────────────── Chat 回调 ──────────────────
+def chat(
+    user_msg, history,
+    max_new, temp, top_p, top_k,
+    rep_pen, pres_pen, stop_raw,
+    api_suffix, log_state
+):
+    """Gradio chat callback: send one turn to the backend and yield the reply.
+
+    Args:
+        user_msg: The new user message, either a string or a dict with a
+            ``"text"`` key.
+        history: Previous messages; forwarded (plus the new user message) only
+            for ``/v1/chat/completions``.
+        max_new, temp, top_p, top_k, rep_pen, pres_pen: Sampling settings.
+        stop_raw: Comma-separated stop sequences; blank entries are dropped.
+        api_suffix: Endpoint path passed on to ``backend``.
+        log_state: Passed through unchanged as the second yielded value.
+
+    Yields:
+        One ``(reply, log_state)`` pair. ``reply`` is ``{"text": ...}`` for
+        ``/v1/chat/completions`` and a plain string for the other endpoints.
+    """
+    from queue import Queue
+
+    user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg
+
+    is_chat_api = api_suffix == "/v1/chat/completions"
+    if is_chat_api:
+        # Full history gives the model the conversation context.
+        messages = history[:]
+        messages.append({"role": "user", "content": user})
+        prompt_input = messages
+    else:
+        prompt_input = user
+
+    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
+    # Only /generate's sampling_params uses `max_new_tokens`; both OpenAI-style
+    # endpoints expect `max_tokens`. The chat endpoint used to be sent
+    # `max_new_tokens`, so the length slider had no effect there.
+    length_key = "max_new_tokens" if api_suffix == "/generate" else "max_tokens"
+    samp = {
+        length_key: int(max_new),
+        "temperature": temp,
+        "top_p": top_p,
+        "top_k": int(top_k),
+        "repetition_penalty": rep_pen,
+        "presence_penalty": pres_pen,
+        **({"stop": stop} if stop else {})
+    }
+
+    result_q = Queue()
+
+    def worker():
+        # Always post exactly one item so the blocking get() below cannot hang.
+        try:
+            result_q.put(backend(prompt_input, samp, api_suffix))
+        except Exception as e:
+            result_q.put(f"[❌ 请求异常] {e}")
+
+    threading.Thread(target=worker, daemon=True).start()
+
+    # Block until the worker posts its result. The old polling loop could exit
+    # with `result` unbound when the worker finished between a timed-out get()
+    # and the next is_alive() check.
+    result = result_q.get()
+    text = result if isinstance(result, str) else str(result)
+
+    if is_chat_api:
+        yield {"text": text.strip()}, log_state
+    else:
+        yield text, log_state
+
+    
+# ────────────────── Gradio UI ──────────────────
+# Debug UI layout: endpoint selector, sampling controls, an optional log panel
+# and the chat widget, plus the event wiring between them.
+with gr.Blocks(title="调试界面") as demo:
+    gr.Markdown(f"## 💬 调试界面  \n权重 **{MODEL_PATH.name}**")
+
+    # Which backend endpoint the chat callback posts to.
+    with gr.Row():
+        api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
+                                value="/generate", label="选择推理接口")
+        
+    # Sampling controls; their values are passed to the chat callback below.
+    with gr.Row():
+        max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
+        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
+    with gr.Row():
+        top_p   = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
+        top_k   = gr.Slider(0, 200, 50, step=1, label="top_k")
+    with gr.Row():
+        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
+        pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
+    stop_txt = gr.Textbox("", label="stop 序列（逗号分隔）")
+
+    # Log panel is created hidden; the checkbox toggles its visibility.
+    log_state = gr.State("")
+    dbg_chk   = gr.Checkbox(label="📜 显示 Debug 面板", value=False)
+    log_box   = gr.Textbox(label="实时日志", lines=20, interactive=False, visible=False)
+
+    # NOTE: this rebinds the module-level name `chat` from the callback function
+    # to the ChatInterface instance; `fn=chat` is evaluated before the rebinding.
+    chat = gr.ChatInterface(
+        fn=chat,
+        additional_inputs=[max_new, temp, top_p, top_k,
+                        rep_pen, pres_pen, stop_txt,
+                        api_choice, log_state],
+        additional_outputs=[log_state],
+        type="messages"
+    )
+
+    # Timer created with a 1.0 interval; each tick writes consume_logs()'s
+    # result into the log box.
+    timer = gr.Timer(1.0, render=True)
+    timer.tick(
+        fn=consume_logs,
+        inputs=[],
+        outputs=[log_box],
+    )
+
+    def clear_all_logs(_):
+        """Empty the log queue and cached text; return blank updates for log_state and log_box."""
+        global LOG_Q, LOG_TXT, prev_log_value
+        # Clear the queue's underlying deque while holding its lock.
+        with LOG_Q.mutex:
+            LOG_Q.queue.clear()
+        LOG_TXT = ""
+        prev_log_value = ""
+        return gr.update(value=""), gr.update(value="")
+
+    # Switching endpoint resets the logs; log_state changes are mirrored into
+    # the log box; the checkbox shows or hides the log box.
+    api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
+    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
+    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
+
+
+demo.launch(server_name="0.0.0.0", server_port=30001)
--- a/meta_ui.py.old
+++ b/meta_ui.py.old
@ -0,0 +1,79 @@
+import gradio as gr
+import requests
+
+API_URL = "http://localhost:30000/v1/completions"
+API_KEY = "token-abc123"
+MODEL_NAME = "Qwen3-14b-base"
+
+# 构造 prompt：Base 模型靠拼接上下文
+def build_prompt(history, user_message):
+    """Flatten the chat history and the new message into one completion prompt.
+
+    Each ``(user, bot)`` pair in ``history`` becomes a
+    ``User: ...\\nAssistant: ...\\n`` segment; the new message is appended as
+    ``User: ...\\nAssistant:`` so the model continues as the assistant.
+    """
+    segments = [f"User: {user}\nAssistant: {bot}\n" for user, bot in history]
+    segments.append(f"User: {user_message}\nAssistant:")
+    return "".join(segments)
+
+# 主对话函数
+def chat(user_message, history, max_tokens, temperature):
+    """Send the conversation to ``API_URL`` and return the completion text.
+
+    The prompt is built with ``build_prompt``. Any failure while posting or
+    while reading ``choices[0].text`` from the response is returned as a
+    ``"[请求失败] ..."`` string instead of being raised.
+    """
+    request_body = {
+        "model": MODEL_NAME,
+        "prompt": build_prompt(history, user_message),
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stop": ["\nUser:", "\nAssistant:"]
+    }
+    auth_headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        resp = requests.post(API_URL, headers=auth_headers, json=request_body, timeout=30)
+        return resp.json()["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"[请求失败] {e}"
+
+# 手动测试 API 功能
+def test_api_connection(max_tokens, temperature):
+    """Post a fixed ``"Ping?"`` prompt to ``API_URL`` and report the outcome.
+
+    Returns a ``"✅ ..."`` string containing the completion text on success,
+    or a ``"❌ ..."`` string containing the exception on any failure.
+    """
+    probe = {
+        "model": MODEL_NAME,
+        "prompt": "Ping?",
+        "max_tokens": max_tokens,
+        "temperature": temperature
+    }
+    auth_headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        reply = requests.post(API_URL, headers=auth_headers, json=probe, timeout=10)
+        first_text = reply.json()["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"❌ API 请求失败: {e}"
+    return f"✅ API 可用，响应: {first_text}"
+
+# Gradio 控件组合
+# UI layout: two sampling sliders, an API self-test button and the chat widget.
+with gr.Blocks(title="Base 模型测试 UI") as demo:
+    gr.Markdown("# 💬 Base 模型对话界面")
+
+    with gr.Row():
+        max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
+        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
+        test_btn = gr.Button("🔁 测试 API 可用性")
+        test_output = gr.Textbox(label="API 测试结果", interactive=False)
+
+    # Pass the sliders as additional_inputs so `chat` receives their CURRENT
+    # values on every call. The previous lambda read `max_tokens.value` and
+    # `temperature.value`, which are the construction-time defaults and never
+    # reflect what the user set in the browser.
+    chatbot = gr.ChatInterface(
+        fn=chat,
+        additional_inputs=[max_tokens, temperature],
+        title=None
+    )
+
+    test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)
+
+# 启动服务
+demo.launch(server_name="0.0.0.0", server_port=30001)
--- a/meta_ui_old.py
+++ b/meta_ui_old.py
@ -0,0 +1,153 @@
+import json, datetime, textwrap, requests, gradio as gr
+from pathlib import Path
+from collections import deque
+import queue, threading, time
+
+# ───────────────────── Basic configuration ─────────────────────
+# Endpoint that backend() POSTs generation requests to.
+API_URL    = "http://localhost:30000/generate"
+# Sent by backend() as "Authorization: Bearer <API_KEY>".
+API_KEY    = "token-abc123"
+# Directory whose config.json (if present) is read by model_name().
+MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
+
+def model_name(path: Path):
+    cfg = path / "config.json"
+    if cfg.exists():
+        data = json.load(cfg.open())
+        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
+    return path.name
+
+# Resolved once at import time; sent as "model" in backend() payloads.
+MODEL_NAME = model_name(MODEL_PATH)
+# Current local time formatted as HH:MM:SS, used to timestamp log lines.
+now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
+
+# ───────────────────── Log queue ─────────────────────
+# Messages pushed by log() and drained by consume_logs().
+LOG_Q: "queue.Queue[str]" = queue.Queue()
+LOG_TXT = ""  # ✅ Global log cache, so chat focus does not block log_box updates
+
+def log(msg):                 # write to the terminal + push onto the queue
+    """Print ``msg`` to stdout (flushed immediately) and enqueue it on ``LOG_Q``."""
+    print(msg, flush=True)
+    LOG_Q.put(msg)
+
+prev_log_value = ""  # 上一帧的日志内容
+
+def consume_logs(dummy=None):
+    """每秒更新 log_box 内容，避免 chat 阻塞 UI 刷新"""
+    global LOG_TXT, prev_log_value
+    buf = deque(LOG_TXT.splitlines(), maxlen=400)
+    while not LOG_Q.empty():
+        buf.append(LOG_Q.get())
+    LOG_TXT = "\n".join(buf)
+    if LOG_TXT != prev_log_value:
+        prev_log_value = LOG_TXT
+        return gr.update(value=LOG_TXT)
+    return gr.update()  # 无更新则不触发前端刷新
+
+
+# ───────────────────── 后端调用 ─────────────────────
+def backend(text, sampling):
+    payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
+    log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
+    try:
+        r = requests.post(API_URL,
+                          headers={"Authorization": f"Bearer {API_KEY}",
+                                   "Content-Type": "application/json"},
+                          json=payload, timeout=180)
+        try:
+            data = r.json()
+        except Exception:
+            data = {}
+        fr   = data.get("meta_info", {}).get("finish_reason")
+        ctok = data.get("meta_info", {}).get("completion_tokens")
+        log(f"🟢 [{now()}] HTTP {r.status_code}  tokens={ctok}  finish={fr}\n"
+            f"🟢 resp800={r.text[:800]!r}")
+        if r.status_code != 200:
+            return f"[HTTP {r.status_code}] {r.text[:300]}"
+        return data.get("text", "").strip() or "[⚠ 空]"
+    except Exception as e:
+        log(f"[❌ 请求异常] {e}")
+        return f"[❌ 请求异常] {e}"
+
+# ───────────────────── Chat 回调 ─────────────────────
+def chat(
+    user, history,
+    max_new, temp, top_p, top_k,
+    rep_pen, pres_pen, stop_raw,
+    log_state
+):
+    import threading
+    from queue import Queue, Empty
+
+    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
+    samp = {
+        "max_new_tokens": int(max_new),
+        "temperature": temp,
+        "top_p": top_p,
+        "top_k": int(top_k),
+        "repetition_penalty": rep_pen,
+        "presence_penalty": pres_pen,
+        **({"stop": stop} if stop else {})
+    }
+
+    result_q = Queue()
+
+    # 后台线程执行 backend 推理
+    def worker():
+        out = backend(user, samp)
+        result_q.put(out)
+
+    thread = threading.Thread(target=worker)
+    thread.start()
+
+    # 先返回提示
+    yield "⏳ 正在生成中...", log_state
+
+    # 每 0.1 秒轮询结果队列（避免阻塞 UI）
+    while thread.is_alive() or not result_q.empty():
+        try:
+            result = result_q.get(timeout=0.1)
+            yield result, log_state
+        except Empty:
+            continue
+
+
+# ───────────────────── Gradio UI ─────────────────────
+# Builds the debug page: sampling controls, a chat interface wired to chat(),
+# and a toggleable log box fed by consume_logs(); then launches the server.
+with gr.Blocks(title="调试界面") as demo:
+    gr.Markdown(f"## 💬 调试界面  \n权重 **{MODEL_PATH.name}**")
+
+    # Sampling-parameter controls
+    with gr.Row():
+        max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
+        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
+    with gr.Row():
+        top_p   = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
+        top_k   = gr.Slider(0, 200, 50, step=1, label="top_k")
+    with gr.Row():
+        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
+        pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
+    stop_txt = gr.Textbox("", label="stop 序列（逗号分隔）")
+
+    log_state = gr.State("")  # state passed through chat() unchanged
+    dbg_chk   = gr.Checkbox(label="📜 显示 Debug 面板", value=False)  # ✅ off by default
+    log_box   = gr.Textbox(label="实时日志", lines=20, interactive=False, visible=False)  # ✅ hidden by default
+
+    # Chat interface (moved ahead of the log)
+    # The additional inputs are passed to chat() after (message, history), in this order.
+    chatbot = gr.ChatInterface(
+        fn=chat,
+        additional_inputs=[max_new, temp, top_p, top_k,
+                           rep_pen, pres_pen, stop_txt, log_state],
+        additional_outputs=[log_state],
+        type="messages"
+    )
+
+
+    # Log refresh timer: calls consume_logs() on each tick (interval 1.0)
+    timer = gr.Timer(1.0, render=True)
+    timer.tick(
+        fn=consume_logs,
+        inputs=[],
+        outputs=[log_box],
+    )
+
+    # NOTE(review): chat() yields log_state unchanged, so this handler presumably
+    # never fires with new text — confirm whether it is still needed.
+    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
+    # Show or hide the log box when the debug checkbox is toggled.
+    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
+
+
+demo.launch(server_name="0.0.0.0", server_port=30001)
--- a/moe_kernels/triton_3_3_1/E=128,N=192,device_name=NVIDIA_GeForce_RTX_3090.json
+++ b/moe_kernels/triton_3_3_1/E=128,N=192,device_name=NVIDIA_GeForce_RTX_3090.json
@ -0,0 +1,10 @@
+{
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
--- a/sglang/python/sglang/srt/entrypoints/http_server.py
+++ b/sglang/python/sglang/srt/entrypoints/http_server.py
@ -216,9 +216,13 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))


@app.get("/health")
-async def health() -> Response:
-    """Check the health of the http server."""
-    return Response(status_code=200)
+async def health():
+    """Check the health of the http server and return version info."""
+    # The payload is static: "name" and "version" are hard-coded literals.
+    return {
+        "status": "ok",
+        "name": "sglang_0.4.8.post1",
+        "version": "v1.0.0"  # put the version number you want displayed here
+    }


@app.get("/health_generate")
--- a/sglang/python/sglang/srt/utils.py
+++ b/sglang/python/sglang/srt/utils.py
@ -868,12 +868,22 @@ def set_ulimit(target_soft_limit=65535):
 def add_api_key_middleware(app, api_key: str):
    @app.middleware("http")
    async def authentication(request, call_next):
+        # OPTIONS 请求（CORS 预检）直接放行
        if request.method == "OPTIONS":
            return await call_next(request)
-        if request.url.path.startswith("/health"):
-            return await call_next(request)
-        if request.url.path.startswith("/metrics"):
+
+        # 明确列出无需鉴权的路径前缀
+        whitelist_prefixes = (
+            "/health",
+            "/metrics",
+            "/ping",
+            "/get_model_info",
+        )
+
+        if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
            return await call_next(request)
+
+        # Bearer Token 鉴权
        if request.headers.get("Authorization") != "Bearer " + api_key:
            return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)
--- a/supervisord.conf
+++ b/supervisord.conf
@ -0,0 +1,23 @@
+; supervisord stays in the foreground (nodaemon) and logs to stdout.
+; NOTE(review): *_maxbytes=0 presumably disables log rotation, which
+; /dev/stdout and /dev/stderr cannot support - confirm against supervisor docs.
+[supervisord]
+nodaemon=true
+logfile=/dev/stdout
+logfile_maxbytes=0
+loglevel=info
+
+; SGLang server on port 30000 with a LoRA adapter registered as "q3",
+; tensor parallelism 4, API-key auth and metrics enabled.
+[program:sglang]
+command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+; Web UI script, started with --port 30001.
+[program:ui]
+command=python3 /app/meta_ui.py --port 30001
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
--- a/supervisord_qwen3-30b-a3b.conf
+++ b/supervisord_qwen3-30b-a3b.conf
@ -0,0 +1,23 @@
+; supervisord stays in the foreground (nodaemon) and logs to stdout.
+; NOTE(review): *_maxbytes=0 presumably disables log rotation, which
+; /dev/stdout and /dev/stderr cannot support - confirm against supervisor docs.
+[supervisord]
+nodaemon=true
+logfile=/dev/stdout
+logfile_maxbytes=0
+loglevel=info
+
+; SGLang server on port 30000 serving the Qwen3-30B-A3B weights,
+; tensor parallelism 4, API-key auth and metrics enabled.
+[program:sglang]
+command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+; Web UI script, started with --port 30001.
+[program:ui]
+command=python3 /app/meta_ui.py --port 30001
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
Author	SHA1	Message	Date
hailin	29de4e1411	.	2025-09-14 19:12:27 +08:00
hailin	c516e234c0	.	2025-09-14 18:46:44 +08:00
hailin	106e5784e2	.	2025-09-14 18:39:52 +08:00
hailin	7669db4b55	.	2025-09-14 18:07:18 +08:00
hailin	af007765a3	.	2025-09-03 10:33:08 +08:00
hailin	363c90da1b	.	2025-09-03 10:01:21 +08:00
hailin	54fd416073	.	2025-08-01 20:17:23 +08:00
hailin	01ce15ddeb	.	2025-08-01 20:15:06 +08:00
hailin	aec50e2029	.	2025-08-01 14:41:37 +08:00
hailin	45c24387d9	.	2025-08-01 14:34:13 +08:00
hailin	db9e41c3e0	.	2025-08-01 14:27:35 +08:00
hailin	f32175aa48	.	2025-08-01 14:25:54 +08:00
hailin	effd559734	.	2025-08-01 14:13:26 +08:00
hailin	a2cc08abc6	.	2025-08-01 14:04:09 +08:00
hailin	e71c4823ef	.	2025-08-01 13:53:06 +08:00
hailin	ebe7f87009	.	2025-08-01 13:34:30 +08:00
hailin	66b11eb836	.	2025-08-01 13:33:54 +08:00
hailin	d2df3af90f	.	2025-08-01 11:54:41 +08:00
hailin	47bb4e366e	.	2025-08-01 11:43:27 +08:00
hailin	452a2ed902	.	2025-08-01 11:30:15 +08:00
hailin	d33a596dfa	.	2025-08-01 11:12:44 +08:00
hailin	985871bf02	.	2025-08-01 11:07:47 +08:00
hailin	eb6f9ba605	.	2025-08-01 11:00:32 +08:00
hailin	342727753a	.	2025-08-01 10:32:10 +08:00
hailin	0b2a49fe2c	.	2025-08-01 10:23:29 +08:00
hailin	89053e46ef	.	2025-08-01 10:15:09 +08:00
hailin	08e5939764	.	2025-08-01 10:02:15 +08:00
hailin	d4823afc81	.	2025-08-01 09:52:03 +08:00
hailin	99a6957d04	.	2025-08-01 09:45:14 +08:00
hailin	7c375562cd	.	2025-08-01 09:36:07 +08:00
hailin	26f8dc9ab5	.	2025-08-01 09:28:41 +08:00
hailin	f86051512d	.	2025-07-31 10:21:30 +08:00
hailin	0b24f7e814	.	2025-07-27 19:37:58 +08:00
hailin	9cb53f50f6	.	2025-07-27 19:32:27 +08:00
hailin	91194df5d8	.	2025-07-27 19:12:14 +08:00
hailin	0ce5191d31	.	2025-07-27 19:07:21 +08:00
hailin	095311d016	.	2025-07-27 18:53:06 +08:00
hailin	f904c754e2	.	2025-07-27 18:44:59 +08:00
hailin	79abd2bbdd	.	2025-07-27 18:34:25 +08:00
hailin	900be3e02d	.	2025-07-27 18:25:42 +08:00
hailin	4bb857f22f	.	2025-07-27 18:18:31 +08:00
hailin	44c3814d13	.	2025-07-27 17:24:27 +08:00
hailin	7bdc80cd1e	.	2025-07-27 17:16:47 +08:00
hailin	8f12b8269a	.	2025-07-27 17:05:54 +08:00
hailin	34c0c43673	.	2025-07-27 16:56:46 +08:00
hailin	6d8fbdc748	.	2025-07-27 16:43:48 +08:00
hailin	244d407937	.	2025-07-27 16:38:15 +08:00
hailin	f8a7f93747	.	2025-07-27 16:26:55 +08:00
hailin	c912bd2f74	.	2025-07-27 16:07:58 +08:00
hailin	6137a2e0d3	.	2025-07-27 16:05:31 +08:00
hailin	3e8115b036	.	2025-07-27 16:00:36 +08:00
hailin	c8c95bd62f	.	2025-07-27 15:50:44 +08:00
hailin	871d5994af	.	2025-07-27 15:39:59 +08:00
hailin	c2b7ec20b8	.	2025-07-27 15:32:23 +08:00
hailin	5d640d814b	.	2025-07-27 15:22:59 +08:00
hailin	991f5c81a8	.	2025-07-27 15:21:08 +08:00
hailin	75c97d6423	.	2025-07-27 15:18:38 +08:00
hailin	4559c52759	.	2025-07-27 15:07:55 +08:00
hailin	8c2b8ca785	.	2025-07-27 15:05:16 +08:00
hailin	8282e562ae	.	2025-07-27 15:02:47 +08:00
hailin	0b560f7067	.	2025-07-27 15:00:44 +08:00
hailin	82e5957f8e	.	2025-07-27 12:42:37 +08:00
hailin	d18985e8a3	.	2025-07-27 12:35:49 +08:00
hailin	4071f51150	.	2025-07-27 12:30:04 +08:00
hailin	818a722192	.	2025-07-27 12:29:24 +08:00
hailin	68a12b4b4a	.	2025-07-27 12:23:08 +08:00
hailin	ccf3398741	.	2025-07-27 12:07:21 +08:00
hailin	b42b5f090b	.	2025-07-27 11:13:13 +08:00
hailin	0333b8af9c	.	2025-07-27 10:52:28 +08:00
hailin	f932f0bd5f	.	2025-07-27 10:13:06 +08:00
hailin	d1a2b815b3	.	2025-07-26 22:19:16 +08:00
hailin	49b8cae1bb	.	2025-07-26 16:55:42 +08:00
hailin	b70297ece1	.	2025-07-26 16:42:47 +08:00
hailin	f0e15aa1d8	.	2025-07-26 08:58:30 +08:00
hailin	d2f69be68d	.	2025-07-25 17:03:33 +08:00
hailin	6aa0932210	.	2025-07-25 16:48:48 +08:00
hailin	174a6b2d76	.	2025-07-25 16:30:15 +08:00
hailin	2cfc960bc3	.	2025-07-25 16:11:48 +08:00
hailin	222c46ef15	.	2025-07-25 16:05:44 +08:00
hailin	2e621b202d	.	2025-07-25 15:33:33 +08:00
hailin	b5036d09c3	.	2025-07-25 15:02:17 +08:00
hailin	39c32555d8	.	2025-07-25 14:58:06 +08:00
hailin	6ea2139b82	.	2025-07-25 12:32:56 +08:00
hailin	35ba2eab42	.	2025-07-25 12:19:03 +08:00
hailin	f82e6c567f	.	2025-07-25 11:48:27 +08:00
hailin	1a58b38c86	.	2025-07-24 13:09:00 +08:00
hailin	d795691369	.	2025-07-17 10:54:12 +08:00
hailin	e252241910	.	2025-07-16 12:47:43 +08:00
hailin	a2a93c7c4c	.	2025-07-07 15:24:18 +08:00
hailin	c5e4ef4a6d	.	2025-07-07 15:21:43 +08:00
hailin	8f6dc142af	.	2025-07-07 14:52:40 +08:00
hailin	9ca3ebe4bb	.	2025-07-07 14:01:39 +08:00
hailin	1d3223c4ae	.	2025-07-04 18:24:57 +08:00
hailin	023d2a0868	.	2025-07-04 17:45:05 +08:00