Compare commits


No commits in common. "main" and "v1.0.0" have entirely different histories.
main ... v1.0.0

20 changed files with 92 additions and 2405 deletions

View File

@@ -89,9 +89,8 @@ WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
@@ -100,7 +99,6 @@ RUN mkdir -p /wheels && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
@@ -110,9 +108,6 @@ RUN pip wheel \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
@@ -122,7 +117,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
python3 python3-dev python3-pip python3-distutils ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
@@ -135,35 +130,27 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# # Install runtime dependencies that were missed
# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Install all dependencies offline (including every required runtime package)
# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
# python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
# rm -rf /tmp/wheels
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
@@ -172,20 +159,15 @@ RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
# ---- Expose ports ----
EXPOSE 30000 30001
EXPOSE 30000
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

Dockerfile.bad (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
############################################################
# Stage-0: build the dependency wheels (PyTorch + SGLang + sgl_kernel)     #
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder
# ---- Python environment ----
RUN apt-get update && \
apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
ln -sf /usr/bin/python3 /usr/bin/python && \
python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six
# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1
RUN case "$CUDA_VERSION" in \
12.6.1) CUINDEX=126 ;; \
12.8.1) CUINDEX=128 ;; \
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
esac && \
python -m pip install --no-cache-dir \
torch==${TORCH_VER}+cu${CUINDEX} \
torchvision==${TV_VER}+cu${CUINDEX} \
--extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
python -m pip install --no-cache-dir \
sglang==0.4.8.post1 \
sgl-kernel==0.0.2.post17 \
nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
# ✅ Fill in the remaining dependencies (required)
python -m pip install --no-cache-dir \
pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
# ✅ Test that the modules import cleanly
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"
############################################################
# Stage-1: produce the minimal runtime image               #
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1
# ---- Python runtime ----
RUN apt-get update && \
apt-get install -y --no-install-recommends python3 python3-distutils && \
ln -sf /usr/bin/python3 /usr/bin/python && \
rm -rf /var/lib/apt/lists/*
# ---- Copy the Python packages and entry points ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
# ---- Start the service ----
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
"--tp", "2", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@@ -1,183 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel gradio requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
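# 7.5 / 8.0 / 8.6 / 9.0 covers Turing, Ampere, and Hopper; trimming this list to the target GPU shortens the build considerably.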
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Download the sgl-kernel Python wheel ───────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; wheels installed offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Download the sgl-kernel Python wheel ───────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; wheels installed offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (pinned, kept in sync with sglang) ───────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies needed by the gradio UI ─────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime极简运行镜像仅离线安装 wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
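# NOTE: this target path hard-codes the python3.10 dist-packages layout; adjust it if the base image's Python version changes.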
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@ -1,224 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ────────────────── Basic configuration ──────────────────
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
def model_name(path: Path):
cfg = path / "config.json"
if cfg.exists():
data = json.load(cfg.open())
return data.get("architectures", [None])[0] or data.get("model_type") or path.name
return path.name
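# e.g. a config.json containing {"architectures": ["Qwen3MoeForCausalLM"]} resolves to
# "Qwen3MoeForCausalLM"; with no config.json the directory name is used as a fallback.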
MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ────────────────── Log queue ──────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""
def log(msg):
print(msg, flush=True)
LOG_Q.put(msg)
prev_log_value = ""
def consume_logs(dummy=None):
global LOG_TXT, prev_log_value
buf = deque(LOG_TXT.splitlines(), maxlen=400)
while not LOG_Q.empty():
buf.append(LOG_Q.get())
LOG_TXT = "\n".join(buf)
if LOG_TXT != prev_log_value:
prev_log_value = LOG_TXT
return gr.update(value=LOG_TXT)
return gr.update()
# ────────────────── Backend calls ──────────────────
def backend(text, sampling, api_suffix):
url = f"http://localhost:30000{api_suffix}"
if api_suffix == "/generate":
payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
elif api_suffix == "/v1/completions":
payload = {
"model": MODEL_NAME,
"prompt": text,
**sampling
}
elif api_suffix == "/v1/chat/completions":
payload = {
"model": MODEL_NAME,
"messages": text, # ← 这里 text 实际是 messages list
**sampling
}
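# Note the payload shapes: /generate takes SGLang-native {"text", "sampling_params"},
# while the /v1 endpoints take OpenAI-style top-level fields.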
log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
try:
r = requests.post(url,
headers={"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"},
json=payload, timeout=180)
try:
data = r.json()
except Exception:
data = {}
if api_suffix == "/generate":
txt = data.get("text", "").strip()
meta = data.get("meta_info", {})
fr = meta.get("finish_reason")
ctok = meta.get("completion_tokens")
elif api_suffix == "/v1/completions":
choice = data.get("choices", [{}])[0]
txt = choice.get("text", "").strip()
fr = choice.get("finish_reason")
ctok = data.get("usage", {}).get("completion_tokens")
elif api_suffix == "/v1/chat/completions":
choice = data.get("choices", [{}])[0]
msg = choice.get("message", {})
txt = msg.get("content", "").strip()
# read completion_tokens from usage
ctok = data.get("usage", {}).get("completion_tokens")
fr = choice.get("finish_reason")  # kept in case the finish reason is needed later
log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
f"🟢 resp={r.text!r}")
if r.status_code != 200:
return f"[HTTP {r.status_code}] {r.text}"
return txt or "[⚠ empty]"
except Exception as e:
log(f"[❌ 请求异常] {e}")
return f"[❌ 请求异常] {e}"
# ────────────────── Chat callback ──────────────────
def chat(
user_msg, history,
max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_raw,
api_suffix, log_state
):
from queue import Queue, Empty
user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg
if api_suffix == "/v1/chat/completions":
# full history handed to the LLM as conversational context
messages = history[:]
messages.append({"role": "user", "content": user})
prompt_input = messages
else:
prompt_input = user
stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
samp = {
("max_tokens" if api_suffix == "/v1/completions" else "max_new_tokens"): int(max_new),
"temperature": temp,
"top_p": top_p,
"top_k": int(top_k),
"repetition_penalty": rep_pen,
"presence_penalty": pres_pen,
**({"stop": stop} if stop else {})
}
result_q = Queue()
def worker():
out = backend(prompt_input, samp, api_suffix)
result_q.put(out)
thread = threading.Thread(target=worker, daemon=True)
thread.start()
if api_suffix == "/v1/chat/completions":
while True:
if not thread.is_alive() and result_q.empty():
break
try:
result = result_q.get(timeout=0.1)
except Empty:
continue
txt = result.strip() if isinstance(result, str) else str(result).strip()
yield {"text": txt}, log_state
return
else:
while thread.is_alive():
try:
result = result_q.get(timeout=0.1)
break
except Empty:
continue
if isinstance(result, str):
result = {"text": result}
elif not isinstance(result, dict) or "text" not in result:
result = {"text": str(result)}
yield result["text"], log_state
return
# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="调试界面") as demo:
gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
with gr.Row():
api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
value="/generate", label="选择推理接口")
with gr.Row():
max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
with gr.Row():
top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
with gr.Row():
rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
log_state = gr.State("")
dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)
log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)
chat_ui = gr.ChatInterface(  # renamed so it no longer shadows the chat() callback
fn=chat,
additional_inputs=[max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_txt,
api_choice, log_state],
additional_outputs=[log_state],
type="messages"
)
timer = gr.Timer(1.0, render=True)
timer.tick(
fn=consume_logs,
inputs=[],
outputs=[log_box],
)
def clear_all_logs(_):
global LOG_Q, LOG_TXT, prev_log_value
with LOG_Q.mutex:
LOG_Q.queue.clear()
LOG_TXT = ""
prev_log_value = ""
return gr.update(value=""), gr.update(value="")
api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,79 +0,0 @@
import gradio as gr
import requests
API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"
# Build the prompt (a base model relies on plain concatenated context)
def build_prompt(history, user_message):
prompt = ""
for user, bot in history:
prompt += f"User: {user}\nAssistant: {bot}\n"
prompt += f"User: {user_message}\nAssistant:"
return prompt
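# e.g. history=[("hi", "hello")] and user_message="how are you?" produce:
# "User: hi\nAssistant: hello\nUser: how are you?\nAssistant:"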
# Main chat function
def chat(user_message, history, max_tokens, temperature):
prompt = build_prompt(history, user_message)
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"
}
payload = {
"model": MODEL_NAME,
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": ["\nUser:", "\nAssistant:"]
}
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
result = response.json()
reply = result["choices"][0]["text"].strip()
except Exception as e:
reply = f"[请求失败] {e}"
return reply
# Manually test the API connection
def test_api_connection(max_tokens, temperature):
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"
}
payload = {
"model": MODEL_NAME,
"prompt": "Ping?",
"max_tokens": max_tokens,
"temperature": temperature
}
try:
resp = requests.post(API_URL, headers=headers, json=payload, timeout=10)
out = resp.json()["choices"][0]["text"].strip()
return f"✅ API 可用,响应: {out}"
except Exception as e:
return f"❌ API 请求失败: {e}"
# Gradio widget layout
with gr.Blocks(title="Base 模型测试 UI") as demo:
gr.Markdown("# 💬 Base 模型对话界面")
with gr.Row():
max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
test_btn = gr.Button("🔁 Test API availability")
test_output = gr.Textbox(label="API test result", interactive=False)
# Pass the sliders as additional_inputs so their *current* values reach chat();
# reading component.value inside a lambda only captures the initial value.
chatbot = gr.ChatInterface(
fn=chat,
additional_inputs=[max_tokens, temperature],
title=None
)
test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)
# Launch the server
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,153 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ───────────────────── Basic configuration ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
def model_name(path: Path):
cfg = path / "config.json"
if cfg.exists():
data = json.load(cfg.open())
return data.get("architectures", [None])[0] or data.get("model_type") or path.name
return path.name
MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = "" # ✅ 全局日志缓存,避免 chat 焦点阻断 log_box 更新
def log(msg):  # write to the terminal and push onto the queue
print(msg, flush=True)
LOG_Q.put(msg)
prev_log_value = "" # 上一帧的日志内容
def consume_logs(dummy=None):
"""每秒更新 log_box 内容,避免 chat 阻塞 UI 刷新"""
global LOG_TXT, prev_log_value
buf = deque(LOG_TXT.splitlines(), maxlen=400)
while not LOG_Q.empty():
buf.append(LOG_Q.get())
LOG_TXT = "\n".join(buf)
if LOG_TXT != prev_log_value:
prev_log_value = LOG_TXT
return gr.update(value=LOG_TXT)
return gr.update()  # no change, so skip the frontend refresh
# ───────────────────── Backend calls ─────────────────────
def backend(text, sampling):
payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
try:
r = requests.post(API_URL,
headers={"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"},
json=payload, timeout=180)
try:
data = r.json()
except Exception:
data = {}
fr = data.get("meta_info", {}).get("finish_reason")
ctok = data.get("meta_info", {}).get("completion_tokens")
log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
f"🟢 resp800={r.text[:800]!r}")
if r.status_code != 200:
return f"[HTTP {r.status_code}] {r.text[:300]}"
return data.get("text", "").strip() or "[⚠ empty]"
except Exception as e:
log(f"[❌ 请求异常] {e}")
return f"[❌ 请求异常] {e}"
# ───────────────────── Chat callback ─────────────────────
def chat(
user, history,
max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_raw,
log_state
):
import threading
from queue import Queue, Empty
stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
samp = {
"max_new_tokens": int(max_new),
"temperature": temp,
"top_p": top_p,
"top_k": int(top_k),
"repetition_penalty": rep_pen,
"presence_penalty": pres_pen,
**({"stop": stop} if stop else {})
}
result_q = Queue()
# run the backend request in a background thread
def worker():
out = backend(user, samp)
result_q.put(out)
thread = threading.Thread(target=worker)
thread.start()
# yield a placeholder first
yield "⏳ Generating...", log_state
# poll the result queue every 0.1 s (keeps the UI responsive)
while thread.is_alive() or not result_q.empty():
try:
result = result_q.get(timeout=0.1)
yield result, log_state
except Empty:
continue
# ───────────────────── Gradio UI ─────────────────────
with gr.Blocks(title="调试界面") as demo:
gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
# Sampling-parameter widgets
with gr.Row():
max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
with gr.Row():
top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
with gr.Row():
rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
log_state = gr.State("") # 状态透传
dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)  # ✅ off by default
log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)  # ✅ hidden by default
# Chat interface (placed before the log)
chatbot = gr.ChatInterface(
fn=chat,
additional_inputs=[max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_txt, log_state],
additional_outputs=[log_state],
type="messages"
)
# Log-refresh timer
timer = gr.Timer(1.0, render=True)
timer.tick(
fn=consume_logs,
inputs=[],
outputs=[log_box],
)
log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,10 +0,0 @@
{
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
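(A pre-tuned Triton tile configuration for sglang's fused MoE kernel; the top-level key, "64" here, appears to select the config by token-batch size. The Dockerfile above copies these files into sglang/srt/layers/moe/fused_moe_triton/configs.)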

View File

@ -216,13 +216,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
@app.get("/health")
async def health():
"""Check the health of the http server and return version info."""
return {
"status": "ok",
"name": "sglang_0.4.8.post1",
"version": "v1.0.0" # 这里写上你希望显示的版本号
}
async def health() -> Response:
"""Check the health of the http server."""
return Response(status_code=200)
@app.get("/health_generate")

View File

@ -868,22 +868,12 @@ def set_ulimit(target_soft_limit=65535):
def add_api_key_middleware(app, api_key: str):
@app.middleware("http")
async def authentication(request, call_next):
# Let OPTIONS (CORS preflight) requests through without auth
if request.method == "OPTIONS":
return await call_next(request)
# Explicitly list the path prefixes that skip authentication
whitelist_prefixes = (
"/health",
"/metrics",
"/ping",
"/get_model_info",
)
if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
if request.url.path.startswith("/health"):
return await call_next(request)
if request.url.path.startswith("/metrics"):
return await call_next(request)
# Bearer-token authentication
if request.headers.get("Authorization") != "Bearer " + api_key:
return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
return await call_next(request)
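# Quick check of the auth behavior (assumes the port and API key from the Dockerfiles above):
#   curl http://localhost:30000/health   -> 200 without a token (whitelisted)
#   curl http://localhost:30000/generate -> 401 {"error": "Unauthorized"}
#   adding -H "Authorization: Bearer token-abc123" satisfies the middleware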

View File

@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

View File

@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0