Compare commits
94 Commits
| SHA1 |
|---|
| 29de4e1411 |
| c516e234c0 |
| 106e5784e2 |
| 7669db4b55 |
| af007765a3 |
| 363c90da1b |
| 54fd416073 |
| 01ce15ddeb |
| aec50e2029 |
| 45c24387d9 |
| db9e41c3e0 |
| f32175aa48 |
| effd559734 |
| a2cc08abc6 |
| e71c4823ef |
| ebe7f87009 |
| 66b11eb836 |
| d2df3af90f |
| 47bb4e366e |
| 452a2ed902 |
| d33a596dfa |
| 985871bf02 |
| eb6f9ba605 |
| 342727753a |
| 0b2a49fe2c |
| 89053e46ef |
| 08e5939764 |
| d4823afc81 |
| 99a6957d04 |
| 7c375562cd |
| 26f8dc9ab5 |
| f86051512d |
| 0b24f7e814 |
| 9cb53f50f6 |
| 91194df5d8 |
| 0ce5191d31 |
| 095311d016 |
| f904c754e2 |
| 79abd2bbdd |
| 900be3e02d |
| 4bb857f22f |
| 44c3814d13 |
| 7bdc80cd1e |
| 8f12b8269a |
| 34c0c43673 |
| 6d8fbdc748 |
| 244d407937 |
| f8a7f93747 |
| c912bd2f74 |
| 6137a2e0d3 |
| 3e8115b036 |
| c8c95bd62f |
| 871d5994af |
| c2b7ec20b8 |
| 5d640d814b |
| 991f5c81a8 |
| 75c97d6423 |
| 4559c52759 |
| 8c2b8ca785 |
| 8282e562ae |
| 0b560f7067 |
| 82e5957f8e |
| d18985e8a3 |
| 4071f51150 |
| 818a722192 |
| 68a12b4b4a |
| ccf3398741 |
| b42b5f090b |
| 0333b8af9c |
| f932f0bd5f |
| d1a2b815b3 |
| 49b8cae1bb |
| b70297ece1 |
| f0e15aa1d8 |
| d2f69be68d |
| 6aa0932210 |
| 174a6b2d76 |
| 2cfc960bc3 |
| 222c46ef15 |
| 2e621b202d |
| b5036d09c3 |
| 39c32555d8 |
| 6ea2139b82 |
| 35ba2eab42 |
| f82e6c567f |
| 1a58b38c86 |
| d795691369 |
| e252241910 |
| a2a93c7c4c |
| c5e4ef4a6d |
| 8f6dc142af |
| 9ca3ebe4bb |
| 1d3223c4ae |
| 023d2a0868 |
Dockerfile (66 changes)
```diff
@@ -89,8 +89,9 @@ WORKDIR /sgl/sglang/python
 RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
     python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
 
-# ── The sgl-kernel Python module ─────────────────────────────────────────────
-RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
+# ── 🔄 Download sgl-kernel (pinned in sync with sglang) ──────────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
 
 # ── Collect all wheels into /wheels ──────────────────────────────────────────
 RUN mkdir -p /wheels && \
@@ -99,6 +100,7 @@ RUN mkdir -p /wheels && \
     cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
     cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
     cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
     pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
 
 # ── ✅ Also package the dependencies the runtime stage needs ──────────────────
@@ -108,6 +110,9 @@ RUN pip wheel \
     setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
     -w /wheels
 
+# ── ✅ Package the dependencies the gradio UI needs ───────────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
 ###############################################################################
 # Stage 2 ─ runtime: minimal runtime image, wheels installed offline only
 ###############################################################################
@@ -117,7 +122,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
 
 RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
-    python3 python3-dev python3-pip python3-distutils ca-certificates \
+    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
     libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
     rm -rf /var/lib/apt/lists/* && \
     python3 -m pip install --no-cache-dir --upgrade pip \
@@ -130,27 +135,35 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 # 👇 Recommended: refresh the linker cache afterwards
 RUN ldconfig
 
-COPY --from=builder-extras /wheels /tmp/wheels
-COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+# ---- Copy the pre-tuned MoE Triton kernel configs ---------------------------
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
 
+COPY --from=builder-extras /wheels /tmp/wheels
 
-#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
 # ✅ Install the self-built torch first so the PyPI build cannot override it
 RUN ls -lh /tmp/wheels && \
     rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
     rm -rf /tmp/wheels
 
-# # Install the dependencies the runtime turned out to be missing
-# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton
-
-# ✅ Install all dependencies offline (covers every runtime-required package)
-# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-#     python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
-#     rm -rf /tmp/wheels
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ Make sure the directory exists
+RUN mkdir -p /tmp/prometheus
 
 # ✅ Add Tini (recommended)
 ENV TINI_VERSION=v0.19.0
@@ -159,15 +172,20 @@ RUN chmod +x /tini
 ENTRYPOINT ["/tini", "--"]
 
 # ---- Copy the model (path can be swapped) ----
-COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
+# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
 
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
 
 # ---- Expose ports ----
-EXPOSE 30000
+EXPOSE 30000 30001
 
-# ---- Launch the SGLang inference service ----
-CMD ["python3", "-m", "sglang.launch_server", \
-     "--host", "0.0.0.0", \
-     "--port", "30000", \
-     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
-     "--tp", "1", \
-     "--api-key", "token-abc123"]
+# Install supervisor
+RUN apt-get update && apt-get install -y supervisor && \
+    mkdir -p /etc/supervisor/conf.d
+
+# Copy the supervisord config and the UI script
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Run supervisor as the container's main process
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
```
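The new CMD hands process management to supervisord, but the compare view does not include supervisord.conf itself. A minimal sketch of a config that would match the CMD and the two exposed ports, assuming meta_ui.py serves the gradio UI on 30001 (program names, the UI port, and the launch flags are illustrative, not taken from the repo):

```bash
# Hypothetical sketch only; the repo's actual supervisord.conf is not shown here.
cat > supervisord.conf <<'EOF'
[supervisord]
nodaemon=true                     ; keep supervisord in the foreground under tini

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --api-key token-abc123 --enable-metrics
autorestart=true

[program:meta_ui]
command=python3 /app/meta_ui.py   ; assumed to serve the gradio UI on 30001
autorestart=true
EOF
```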
@@ -0,0 +1,191 @@ (new file; full contents follow)

```dockerfile
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: use the self-built torch to build torchvision /
# flashinfer / sglang, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compiling, install directly) ──────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang sources and produce a wheel ───────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── 🔄 Download sgl-kernel (pinned in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies the gradio UI needs ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, wheels installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache afterwards
RUN ldconfig

# ---- Copy the pre-tuned MoE Triton kernel configs ---------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
```
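For reference, a build-and-run invocation consistent with the ARGs and ports declared above; the image tag and the MAX_JOBS value are arbitrary examples, not taken from the repo:

```bash
# Build: CUDA_VERSION and MAX_JOBS are the build args this Dockerfile declares.
docker build --build-arg CUDA_VERSION=12.6.1 --build-arg MAX_JOBS=32 -t sglang-qwen3-30b:cu126 .

# Run: 30000 is the SGLang API port and 30001 the UI port, both EXPOSEd above.
docker run --gpus all -p 30000:30000 -p 30001:30001 sglang-qwen3-30b:cu126
```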
@@ -0,0 +1,191 @@ (new file; identical to the 191-line Dockerfile above except that it packages the Qwen3-30B-A3B-Base checkpoint instead)

```dockerfile
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base
```
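Both of these images set PROMETHEUS_MULTIPROC_DIR so that prometheus_client's MultiProcessCollector can aggregate metrics across the server's worker processes. A quick way to verify that wiring inside a built image (the image tag is an assumed example):

```bash
docker run --rm sglang-qwen3-30b:cu126 python3 -c "
import os
from prometheus_client import CollectorRegistry, multiprocess
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
print('multiproc dir:', os.environ['PROMETHEUS_MULTIPROC_DIR'])
"
```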
@@ -1,63 +0,0 @@ (file deleted; its former contents follow)

```dockerfile
############################################################
# Stage-0: build the dependency wheels (PyTorch + SGLang + sgl_kernel)
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder

# ---- Python environment ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six

# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1
RUN case "$CUDA_VERSION" in \
      12.6.1) CUINDEX=126 ;; \
      12.8.1) CUINDEX=128 ;; \
      *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac && \
    python -m pip install --no-cache-dir \
      torch==${TORCH_VER}+cu${CUINDEX} \
      torchvision==${TV_VER}+cu${CUINDEX} \
      --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
    python -m pip install --no-cache-dir \
      sglang==0.4.8.post1 \
      sgl-kernel==0.0.2.post17 \
      nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
    # ✅ Fill in the missing dependencies (required)
    python -m pip install --no-cache-dir \
      pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
# ✅ Test module integrity
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"

############################################################
# Stage-1: produce the minimal runtime image
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# ---- Python runtime ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

# ---- Copy the Python packages and entry points ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

# ---- Launch the service ----
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123"]
```
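The deleted file kept its module-integrity check commented out; an equivalent check can still be run against any of the new images (the tag is an assumed example):

```bash
docker run --rm --gpus all sglang-qwen3-30b:cu126 \
    python3 -c "import sglang, torch, sgl_kernel; print('torch', torch.__version__)"
```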
@@ -0,0 +1,177 @@ (new file) Stages 0 and 1 match the 191-line Dockerfile above through the sglang build step, with three differences: sgl-kernel is downloaded unpinned into /tmp/sgl_kernel_wheel, the wheel-collection step does not copy an sgl_kernel wheel into /wheels, and no gradio wheels are packaged.

```dockerfile
# ── The sgl-kernel Python module ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
```

The runtime stage installs the same apt packages and cupti libraries and runs ldconfig, then diverges: there is no MoE config copy and no gradio/UI layer, the sgl-kernel wheel is installed from its own directory, a torch.distributed check replaces the gradio check, and the image launches sglang directly:

```dockerfile
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path can be swapped) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
```
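Once this container reports healthy, the service can be exercised the same way the HEALTHCHECK does. The /v1 routes follow SGLang's OpenAI-compatible API, and --enable-metrics exposes Prometheus metrics on the server port; the host and port here assume the default run configuration:

```bash
curl -fs http://localhost:30000/health                                        # same probe as HEALTHCHECK
curl -s -H "Authorization: Bearer token-abc123" http://localhost:30000/v1/models
curl -s http://localhost:30000/metrics | head                                 # Prometheus metrics
```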
@@ -0,0 +1,191 @@ (new file; identical to the first 191-line Dockerfile above except that the model copy step is commented out)

```dockerfile
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
```
@ -0,0 +1,177 @@
|
||||||
|
###############################################################################
|
||||||
|
# Stage 0 ─ builder-torch:编译 PyTorch 2.7.1 (+cu126)
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
|
||||||
|
|
||||||
|
ENV USE_CUDA=1 \
|
||||||
|
USE_DISTRIBUTED=1 \
|
||||||
|
USE_MPI=1 \
|
||||||
|
USE_GLOO=1 \
|
||||||
|
USE_NCCL=1 \
|
||||||
|
USE_SYSTEM_NCCL=1 \
|
||||||
|
BUILD_TEST=0
|
||||||
|
|
||||||
|
ARG MAX_JOBS=90
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
|
||||||
|
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
|
||||||
|
libopenblas-dev libopenmpi-dev \
|
||||||
|
libnccl2=2.22.3-1+cuda12.6 \
|
||||||
|
libnccl-dev=2.22.3-1+cuda12.6 \
|
||||||
|
libjpeg-dev libpng-dev ca-certificates && \
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
|
||||||
|
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
|
||||||
|
|
||||||
|
WORKDIR /opt/pytorch
|
||||||
|
ENV MAX_JOBS=${MAX_JOBS}
|
||||||
|
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
|
||||||
|
python3 setup.py bdist_wheel
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Stage 1 ─ builder-extras:用自编 Torch 装 TV / flashinfer / sglang,并收集轮子
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-pip python3-distutils python3.10-dev git build-essential \
|
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
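
# For reference, a metrics scraper can aggregate the per-process files with
# prometheus_client's documented multiprocess API (an illustrative snippet,
# not executed in this image):
#
#   from prometheus_client import CollectorRegistry, generate_latest, multiprocess
#   registry = CollectorRegistry()
#   multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
#   print(generate_latest(registry).decode())     # Prometheus text format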

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
     "--tp", "2", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,183 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies needed by the gradio UI ───────────────────────
RUN pip wheel gradio requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,191 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── 🔄 Download sgl-kernel (kept in sync with sglang) ─────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies needed by the gradio UI ───────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs -------------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@ -0,0 +1,224 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ────────────────── Basic configuration ──────────────────
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")


def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name
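
# Illustrative behavior (assumed values, not read from a real checkpoint):
# a Qwen3 config.json typically carries "architectures": ["Qwen3ForCausalLM"],
# so model_name() would return that string, while a directory without a
# config.json falls back to the folder name.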

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ────────────────── Log queue ──────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""


def log(msg):
    print(msg, flush=True)
    LOG_Q.put(msg)


prev_log_value = ""

def consume_logs(dummy=None):
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()


# ────────────────── Backend call ──────────────────
def backend(text, sampling, api_suffix):
    url = f"http://localhost:30000{api_suffix}"
    if api_suffix == "/generate":
        payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    elif api_suffix == "/v1/completions":
        payload = {
            "model": MODEL_NAME,
            "prompt": text,
            **sampling
        }
    elif api_suffix == "/v1/chat/completions":
        payload = {
            "model": MODEL_NAME,
            "messages": text,  # ← here `text` is actually the messages list
            **sampling
        }

    log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(url,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}

        if api_suffix == "/generate":
            txt = data.get("text", "").strip()
            meta = data.get("meta_info", {})
            fr = meta.get("finish_reason")
            ctok = meta.get("completion_tokens")
        elif api_suffix == "/v1/completions":
            choice = data.get("choices", [{}])[0]
            txt = choice.get("text", "").strip()
            fr = choice.get("finish_reason")
            ctok = data.get("usage", {}).get("completion_tokens")
        elif api_suffix == "/v1/chat/completions":
            choice = data.get("choices", [{}])[0]
            msg = choice.get("message", {})
            txt = msg.get("content", "").strip()

            # New: read completion_tokens from usage
            ctok = data.get("usage", {}).get("completion_tokens")
            fr = choice.get("finish_reason")  # in case the finish reason is needed later

        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp={r.text!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text}"
        return txt or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ Request failed] {e}")
        return f"[❌ Request failed] {e}"

# ────────────────── Chat callback ──────────────────
def chat(
    user_msg, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    api_suffix, log_state
):
    from queue import Queue, Empty

    user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg

    if api_suffix == "/v1/chat/completions":
        # Full history for the LLM (used for in-context reasoning)
        messages = history[:]
        messages.append({"role": "user", "content": user})
        prompt_input = messages
    else:
        prompt_input = user

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        ("max_tokens" if api_suffix == "/v1/completions" else "max_new_tokens"): int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    def worker():
        out = backend(prompt_input, samp, api_suffix)
        result_q.put(out)

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    if api_suffix == "/v1/chat/completions":
        while True:
            if not thread.is_alive() and result_q.empty():
                break
            try:
                result = result_q.get(timeout=0.1)
            except Empty:
                continue

            txt = result.strip() if isinstance(result, str) else str(result).strip()

            yield {"text": txt}, log_state
            return
    else:
        result = None  # guard against the worker finishing before the first poll
        while thread.is_alive():
            try:
                result = result_q.get(timeout=0.1)
                break
            except Empty:
                continue
        if result is None and not result_q.empty():
            result = result_q.get()

        if isinstance(result, str):
            result = {"text": result}
        elif not isinstance(result, dict) or "text" not in result:
            result = {"text": str(result)}

        yield result["text"], log_state
        return


# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI \nWeights **{MODEL_PATH.name}**")

    with gr.Row():
        api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
                                 value="/generate", label="Inference endpoint")

    with gr.Row():
        max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
        temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen = gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
        stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)

    chat_ui = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt,
                           api_choice, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    def clear_all_logs(_):
        global LOG_Q, LOG_TXT, prev_log_value
        with LOG_Q.mutex:
            LOG_Q.queue.clear()
        LOG_TXT = ""
        prev_log_value = ""
        return gr.update(value=""), gr.update(value="")

    api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
|
||||||
|
|
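For quick checks outside the UI, here is a minimal sketch of the same three calls with plain requests. It assumes, as the scripts above do, an sglang server on localhost:30000 with API key token-abc123; the model name is a placeholder:

import requests

BASE = "http://localhost:30000"
HDRS = {"Authorization": "Bearer token-abc123"}

# Native endpoint: sampling params are nested, the limit is max_new_tokens.
r = requests.post(f"{BASE}/generate", headers=HDRS, json={
    "text": "Hello", "sampling_params": {"max_new_tokens": 32, "temperature": 0.8}})
print(r.json().get("text"))

# OpenAI-compatible completions: flat payload, the limit is max_tokens.
r = requests.post(f"{BASE}/v1/completions", headers=HDRS, json={
    "model": "default", "prompt": "Hello", "max_tokens": 32})
print(r.json()["choices"][0]["text"])

# OpenAI-compatible chat completions: messages instead of a raw prompt.
r = requests.post(f"{BASE}/v1/chat/completions", headers=HDRS, json={
    "model": "default", "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 32})
print(r.json()["choices"][0]["message"]["content"])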
@ -0,0 +1,79 @@
import gradio as gr
import requests

API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"

# Build the prompt: a base model has no chat template, so the dialogue
# is spliced together as plain text
def build_prompt(history, user_message):
    prompt = ""
    for user, bot in history:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += f"User: {user_message}\nAssistant:"
    return prompt

# Main chat function
def chat(user_message, history, max_tokens, temperature):
    prompt = build_prompt(history, user_message)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": ["\nUser:", "\nAssistant:"]
    }

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
        reply = result["choices"][0]["text"].strip()
    except Exception as e:
        reply = f"[Request failed] {e}"

    return reply

# Manual API connectivity test
def test_api_connection(max_tokens, temperature):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": "Ping?",
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    try:
        resp = requests.post(API_URL, headers=headers, json=payload, timeout=10)
        out = resp.json()["choices"][0]["text"].strip()
        return f"✅ API reachable, response: {out}"
    except Exception as e:
        return f"❌ API request failed: {e}"

# Assemble the Gradio controls
with gr.Blocks(title="Base model test UI") as demo:
    gr.Markdown("# 💬 Base model chat UI")

    with gr.Row():
        max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
        test_btn = gr.Button("🔁 Test API availability")
        test_output = gr.Textbox(label="API test result", interactive=False)

    # Pass the sliders as additional_inputs so their *live* values reach chat();
    # reading max_tokens.value inside a lambda would freeze the initial values.
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_tokens, temperature],
        title=None
    )

    test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)

# Start the app
demo.launch(server_name="0.0.0.0", server_port=30001)
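To see why the stop list matters: without it, a base model happily continues the transcript and writes the next "User:" turn itself. A quick, self-contained sketch of what build_prompt produces for a one-turn history:

def build_prompt(history, user_message):
    prompt = ""
    for user, bot in history:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += f"User: {user_message}\nAssistant:"
    return prompt

print(build_prompt([("Hi", "Hello!")], "What is 2+2?"))
# User: Hi
# Assistant: Hello!
# User: What is 2+2?
# Assistant:
# The server is told to stop at "\nUser:" / "\nAssistant:", so generation
# ends as soon as the model tries to start a new turn.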
@ -0,0 +1,153 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ───────────────────── Basic config ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")

def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""  # global log cache, so a focused chat call cannot block log_box updates

def log(msg):  # write to the terminal + push onto the queue
    print(msg, flush=True)
    LOG_Q.put(msg)

prev_log_value = ""  # log content of the previous frame

def consume_logs(dummy=None):
    """Refresh log_box every second so chat cannot block UI updates."""
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()  # nothing new, skip the frontend refresh


# ───────────────────── Backend call ─────────────────────
def backend(text, sampling):
    payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(API_URL,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}
        fr = data.get("meta_info", {}).get("finish_reason")
        ctok = data.get("meta_info", {}).get("completion_tokens")
        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp800={r.text[:800]!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text[:300]}"
        return data.get("text", "").strip() or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ request error] {e}")
        return f"[❌ request error] {e}"

# ───────────────────── Chat callback ─────────────────────
def chat(
    user, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    log_state
):
    import threading
    from queue import Queue, Empty

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        "max_new_tokens": int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    # Run the backend inference on a background thread
    def worker():
        out = backend(user, samp)
        result_q.put(out)

    thread = threading.Thread(target=worker)
    thread.start()

    # Show a placeholder immediately
    yield "⏳ Generating...", log_state

    # Poll the result queue every 0.1 s (keeps the UI responsive)
    while thread.is_alive() or not result_q.empty():
        try:
            result = result_q.get(timeout=0.1)
            yield result, log_state
        except Empty:
            continue


# ───────────────────── Gradio UI ─────────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI  \nWeights: **{MODEL_PATH.name}**")

    # Sampling parameter controls
    with gr.Row():
        max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen  = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen = gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
        stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")  # state passthrough
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)  # off by default
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)  # hidden by default

    # Chat interface (placed before the log panel)
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    # Log refresh timer
    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
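The logging arrangement above is a small producer/consumer pattern: any thread may call log(), and only the timer callback touches the UI. A self-contained sketch of the same idea (names are illustrative, not from the script):

import queue, threading, time
from collections import deque

log_q: "queue.Queue[str]" = queue.Queue()
buffer = deque(maxlen=400)  # cap memory, like the 400-line window above

def log(msg: str) -> None:   # producer: safe to call from any thread
    log_q.put(msg)

def drain() -> str:          # consumer: called periodically by a single owner
    while not log_q.empty():
        buffer.append(log_q.get())
    return "\n".join(buffer)

threading.Thread(target=lambda: [log(f"line {i}") for i in range(5)]).start()
time.sleep(0.1)
print(drain())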
@ -0,0 +1,10 @@
{
    "64": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 3
    }
}
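This JSON has the shape of a Triton fused-kernel tuning file: the outer key is a token-count bucket and the inner dict holds launch parameters for that bucket. A hedged sketch of how such a file is typically consumed; the nearest-bucket rule is an assumption about the loader, not taken from this repo:

import json

def pick_config(configs: dict, num_tokens: int) -> dict:
    # Choose the tuning entry whose bucket is closest to the actual size.
    best = min(configs, key=lambda k: abs(int(k) - num_tokens))
    return configs[best]

configs = json.loads("""{"64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32,
  "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}}""")
print(pick_config(configs, 48))  # -> the "64" entry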
@ -216,9 +216,13 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 
 
 @app.get("/health")
-async def health() -> Response:
-    """Check the health of the http server."""
-    return Response(status_code=200)
+async def health():
+    """Check the health of the http server and return version info."""
+    return {
+        "status": "ok",
+        "name": "sglang_0.4.8.post1",
+        "version": "v1.0.0"  # put the version string you want to expose here
+    }
 
 
 @app.get("/health_generate")
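With this change, /health goes from an empty 200 to a JSON body, so a probe can read the deployment's version. A quick check (no Authorization header needed, since /health is whitelisted in the middleware hunk below):

import requests

r = requests.get("http://localhost:30000/health", timeout=5)
assert r.status_code == 200
print(r.json())  # e.g. {"status": "ok", "name": "sglang_0.4.8.post1", "version": "v1.0.0"}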
@ -868,12 +868,22 @@ def set_ulimit(target_soft_limit=65535):
 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
+        # Let OPTIONS requests (CORS preflight) through without auth
         if request.method == "OPTIONS":
             return await call_next(request)
-        if request.url.path.startswith("/health"):
-            return await call_next(request)
-        if request.url.path.startswith("/metrics"):
-            return await call_next(request)
+
+        # Explicitly list the path prefixes that skip authentication
+        whitelist_prefixes = (
+            "/health",
+            "/metrics",
+            "/ping",
+            "/get_model_info",
+        )
+
+        if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
+            return await call_next(request)
+
+        # Bearer token authentication
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
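A small smoke test for the new whitelist behaviour: unauthenticated requests should pass on the listed prefixes and be rejected everywhere else. Paths follow the hunk above; the base URL and key are this repo's defaults:

import requests

BASE = "http://localhost:30000"

# Whitelisted prefixes: reachable without a token.
for path in ("/health", "/metrics", "/ping", "/get_model_info"):
    print(path, requests.get(BASE + path, timeout=5).status_code)  # expect != 401

# Protected endpoint: 401 without the bearer token, 200 with it.
payload = {"text": "Hi", "sampling_params": {"max_new_tokens": 8}}
print(requests.post(f"{BASE}/generate", json=payload, timeout=30).status_code)  # 401
print(requests.post(f"{BASE}/generate", json=payload, timeout=30,
                    headers={"Authorization": "Bearer token-abc123"}).status_code)  # 200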
@ -0,0 +1,23 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
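This variant registers a LoRA adapter under the name q3 via --lora-paths. A hedged sketch of selecting that adapter per request: recent sglang builds accept a lora_path field in the /generate payload, but treat the exact field name as an assumption to verify against the deployed version:

import requests

payload = {
    "text": "Hello",
    "sampling_params": {"max_new_tokens": 32},
    "lora_path": "q3",  # adapter name registered via --lora-paths q3=...
}
r = requests.post("http://localhost:30000/generate",
                  headers={"Authorization": "Bearer token-abc123"},
                  json=payload, timeout=60)
print(r.json().get("text"))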
@ -0,0 +1,23 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
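supervisord starts both programs at once, but the UI on port 30001 is only useful once the model server on port 30000 is up. A gate like the following at the top of meta_ui.py would hold the UI back until /health answers (a sketch, not part of the repo):

import time, requests

def wait_for_server(url: str = "http://localhost:30000/health",
                    timeout_s: int = 600) -> None:
    """Block until the sglang /health endpoint responds, or raise."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return
        except requests.RequestException:
            pass  # server not up yet; keep polling
        time.sleep(2)
    raise RuntimeError(f"server at {url} not healthy after {timeout_s}s")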