Compare commits


No commits in common. "main" and "v1.0.0" have entirely different histories.
main ... v1.0.0

20 changed files with 92 additions and 2405 deletions

View File

@@ -89,9 +89,8 @@ WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
@@ -100,7 +99,6 @@ RUN mkdir -p /wheels && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
@@ -110,9 +108,6 @@ RUN pip wheel \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
@@ -122,7 +117,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
python3 python3-dev python3-pip python3-distutils ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
@@ -135,35 +130,27 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# # Install runtime dependencies that were missed
# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Install all dependencies offline (including every required runtime package)
# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
# python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
# rm -rf /tmp/wheels
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
@@ -172,20 +159,15 @@ RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
# ---- Expose ports ----
EXPOSE 30000 30001
EXPOSE 30000
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

Dockerfile.bad (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
############################################################
# Stage-0: build the dependency wheels (PyTorch + SGLang + sgl_kernel)     #
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder
# ---- Python environment ----
RUN apt-get update && \
apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
ln -sf /usr/bin/python3 /usr/bin/python && \
python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six
# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1
RUN case "$CUDA_VERSION" in \
12.6.1) CUINDEX=126 ;; \
12.8.1) CUINDEX=128 ;; \
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
esac && \
python -m pip install --no-cache-dir \
torch==${TORCH_VER}+cu${CUINDEX} \
torchvision==${TV_VER}+cu${CUINDEX} \
--extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
python -m pip install --no-cache-dir \
sglang==0.4.8.post1 \
sgl-kernel==0.0.2.post17 \
nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
# ✅ Fill in the remaining dependencies (required)
python -m pip install --no-cache-dir \
pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
# ✅ Test that the modules import cleanly
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"
############################################################
# Stage-1: produce the minimal runtime image               #
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1
# ---- Python runtime ----
RUN apt-get update && \
apt-get install -y --no-install-recommends python3 python3-distutils && \
ln -sf /usr/bin/python3 /usr/bin/python && \
rm -rf /var/lib/apt/lists/*
# ---- Copy the Python packages and entry points ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
# ---- Start the service ----
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (kept in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000
# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
"--tp", "2", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@@ -1,183 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies for the gradio UI ───────────────────────────
RUN pip wheel gradio requests -w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the cupti shared libraries (avoids hard-coding the version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Recommended to add this right after
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first (so the PyPI build cannot override it)
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: install TV / flashinfer / sglang with the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (depends on the local torch) ──────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip compilation, install directly) ────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids compiling flash-attn) ─────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source and package a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Python module for sgl-kernel ───────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage requires ─────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; installs wheels offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
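# 7.5 / 8.0 / 8.6 / 9.0 covers Turing, Ampere, and Hopper; trimming this list to the target GPU shortens the build considerably.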
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Download the sgl-kernel Python wheel ───────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; wheels installed offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── Download the sgl-kernel Python wheel ───────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
###############################################################################
# Stage 2 ─ runtime (minimal runtime image; wheels installed offline only)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the port ----
EXPOSE 30000
# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
"--tp", "4", \
"--api-key", "token-abc123", \
"--enable-metrics"]

View File

@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
ENV USE_CUDA=1 \
USE_DISTRIBUTED=1 \
USE_MPI=1 \
USE_GLOO=1 \
USE_NCCL=1 \
USE_SYSTEM_NCCL=1 \
BUILD_TEST=0
ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
libopenblas-dev libopenmpi-dev \
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 \
libjpeg-dev libpng-dev ca-certificates && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
python3 setup.py bdist_wheel
###############################################################################
# Stage 1 ─ builder-extras: build TV / flashinfer / sglang against the self-built Torch, then collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip python3-distutils python3.10-dev git build-essential \
cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
libopenmpi-dev libopenblas-dev\
libnccl2=2.22.3-1+cuda12.6 \
libnccl-dev=2.22.3-1+cuda12.6 && \
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# ── Install the self-built torch wheel ─────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# ── Build torchvision 0.22.1 (against the local torch) ─────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# ── Build flashinfer (main branch supports torch 2.7 / cu126) ──────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
python3 -m pip wheel . --no-deps -w dist/
# # ── Install vllm (skip the build, install directly) ──────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# ── Download the prebuilt vllm wheel (avoids building flash-attn) ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# ── Build the local sglang source into a wheel ─────────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── 🔄 Download sgl-kernel (pinned, kept in sync with sglang) ───────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Collect all wheels into /wheels ────────────────────────────────────────
RUN mkdir -p /wheels && \
cp /tmp/torch_dist/torch*.whl /wheels/ && \
cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ Also package the dependencies the runtime stage needs ────────────────
RUN pip wheel \
pydantic orjson psutil pyzmq pynvml \
transformers==4.52.0 uvicorn fastapi IPython aiohttp \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ Package the dependencies needed by the gradio UI ─────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime极简运行镜像仅离线安装 wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
&& python3 -m pip install --no-cache-dir xgrammar
# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# 👇 Refresh the linker cache so the copied libraries are picked up
RUN ldconfig
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
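# NOTE: this target path hard-codes the python3.10 dist-packages layout; adjust it if the base image's Python version changes.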
COPY --from=builder-extras /wheels /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Expose the ports ----
EXPOSE 30000 30001
# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# Copy the supervisord config file and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@ -1,224 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ────────────────── Basic configuration ──────────────────
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
def model_name(path: Path):
cfg = path / "config.json"
if cfg.exists():
data = json.load(cfg.open())
return data.get("architectures", [None])[0] or data.get("model_type") or path.name
return path.name
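# e.g. a config.json containing {"architectures": ["Qwen3MoeForCausalLM"]} resolves to
# "Qwen3MoeForCausalLM"; with no config.json the directory name is used as a fallback.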
MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ────────────────── Log queue ──────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""
def log(msg):
print(msg, flush=True)
LOG_Q.put(msg)
prev_log_value = ""
def consume_logs(dummy=None):
global LOG_TXT, prev_log_value
buf = deque(LOG_TXT.splitlines(), maxlen=400)
while not LOG_Q.empty():
buf.append(LOG_Q.get())
LOG_TXT = "\n".join(buf)
if LOG_TXT != prev_log_value:
prev_log_value = LOG_TXT
return gr.update(value=LOG_TXT)
return gr.update()
# ────────────────── Backend calls ──────────────────
def backend(text, sampling, api_suffix):
url = f"http://localhost:30000{api_suffix}"
if api_suffix == "/generate":
payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
elif api_suffix == "/v1/completions":
payload = {
"model": MODEL_NAME,
"prompt": text,
**sampling
}
elif api_suffix == "/v1/chat/completions":
payload = {
"model": MODEL_NAME,
"messages": text, # ← 这里 text 实际是 messages list
**sampling
}
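# Note the payload shapes: /generate takes SGLang-native {"text", "sampling_params"},
# while the /v1 endpoints take OpenAI-style top-level fields.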
log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
try:
r = requests.post(url,
headers={"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"},
json=payload, timeout=180)
try:
data = r.json()
except Exception:
data = {}
if api_suffix == "/generate":
txt = data.get("text", "").strip()
meta = data.get("meta_info", {})
fr = meta.get("finish_reason")
ctok = meta.get("completion_tokens")
elif api_suffix == "/v1/completions":
choice = data.get("choices", [{}])[0]
txt = choice.get("text", "").strip()
fr = choice.get("finish_reason")
ctok = data.get("usage", {}).get("completion_tokens")
elif api_suffix == "/v1/chat/completions":
choice = data.get("choices", [{}])[0]
msg = choice.get("message", {})
txt = msg.get("content", "").strip()
# read completion_tokens from usage
ctok = data.get("usage", {}).get("completion_tokens")
fr = choice.get("finish_reason")  # kept in case the finish reason is needed later
log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
f"🟢 resp={r.text!r}")
if r.status_code != 200:
return f"[HTTP {r.status_code}] {r.text}"
return txt or "[⚠ empty]"
except Exception as e:
log(f"[❌ 请求异常] {e}")
return f"[❌ 请求异常] {e}"
# ────────────────── Chat callback ──────────────────
def chat(
user_msg, history,
max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_raw,
api_suffix, log_state
):
from queue import Queue, Empty
user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg
if api_suffix == "/v1/chat/completions":
# full history handed to the LLM as conversational context
messages = history[:]
messages.append({"role": "user", "content": user})
prompt_input = messages
else:
prompt_input = user
stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
samp = {
("max_tokens" if api_suffix == "/v1/completions" else "max_new_tokens"): int(max_new),
"temperature": temp,
"top_p": top_p,
"top_k": int(top_k),
"repetition_penalty": rep_pen,
"presence_penalty": pres_pen,
**({"stop": stop} if stop else {})
}
result_q = Queue()
def worker():
out = backend(prompt_input, samp, api_suffix)
result_q.put(out)
thread = threading.Thread(target=worker, daemon=True)
thread.start()
if api_suffix == "/v1/chat/completions":
while True:
if not thread.is_alive() and result_q.empty():
break
try:
result = result_q.get(timeout=0.1)
except Empty:
continue
txt = result.strip() if isinstance(result, str) else str(result).strip()
yield {"text": txt}, log_state
return
else:
while thread.is_alive():
try:
result = result_q.get(timeout=0.1)
break
except Empty:
continue
if isinstance(result, str):
result = {"text": result}
elif not isinstance(result, dict) or "text" not in result:
result = {"text": str(result)}
yield result["text"], log_state
return
# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="调试界面") as demo:
gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
with gr.Row():
api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
value="/generate", label="选择推理接口")
with gr.Row():
max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
with gr.Row():
top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
with gr.Row():
rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
log_state = gr.State("")
dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)
log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)
chat_ui = gr.ChatInterface(  # renamed so it no longer shadows the chat() callback
fn=chat,
additional_inputs=[max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_txt,
api_choice, log_state],
additional_outputs=[log_state],
type="messages"
)
timer = gr.Timer(1.0, render=True)
timer.tick(
fn=consume_logs,
inputs=[],
outputs=[log_box],
)
def clear_all_logs(_):
global LOG_Q, LOG_TXT, prev_log_value
with LOG_Q.mutex:
LOG_Q.queue.clear()
LOG_TXT = ""
prev_log_value = ""
return gr.update(value=""), gr.update(value="")
api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,79 +0,0 @@
import gradio as gr
import requests
API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"
# Build the prompt (a base model relies on plain concatenated context)
def build_prompt(history, user_message):
prompt = ""
for user, bot in history:
prompt += f"User: {user}\nAssistant: {bot}\n"
prompt += f"User: {user_message}\nAssistant:"
return prompt
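# e.g. history=[("hi", "hello")] and user_message="how are you?" produce:
# "User: hi\nAssistant: hello\nUser: how are you?\nAssistant:"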
# Main chat function
def chat(user_message, history, max_tokens, temperature):
prompt = build_prompt(history, user_message)
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"
}
payload = {
"model": MODEL_NAME,
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": ["\nUser:", "\nAssistant:"]
}
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
result = response.json()
reply = result["choices"][0]["text"].strip()
except Exception as e:
reply = f"[请求失败] {e}"
return reply
# Manually test the API connection
def test_api_connection(max_tokens, temperature):
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"
}
payload = {
"model": MODEL_NAME,
"prompt": "Ping?",
"max_tokens": max_tokens,
"temperature": temperature
}
try:
resp = requests.post(API_URL, headers=headers, json=payload, timeout=10)
out = resp.json()["choices"][0]["text"].strip()
return f"✅ API 可用,响应: {out}"
except Exception as e:
return f"❌ API 请求失败: {e}"
# Gradio widget layout
with gr.Blocks(title="Base 模型测试 UI") as demo:
gr.Markdown("# 💬 Base 模型对话界面")
with gr.Row():
max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
test_btn = gr.Button("🔁 Test API availability")
test_output = gr.Textbox(label="API test result", interactive=False)
# Pass the sliders as additional_inputs so their *current* values reach chat();
# reading component.value inside a lambda only captures the initial value.
chatbot = gr.ChatInterface(
fn=chat,
additional_inputs=[max_tokens, temperature],
title=None
)
test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)
# Launch the server
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,153 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ───────────────────── Basic configuration ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")
def model_name(path: Path):
cfg = path / "config.json"
if cfg.exists():
data = json.load(cfg.open())
return data.get("architectures", [None])[0] or data.get("model_type") or path.name
return path.name
MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = "" # ✅ 全局日志缓存,避免 chat 焦点阻断 log_box 更新
def log(msg):  # write to the terminal and push onto the queue
print(msg, flush=True)
LOG_Q.put(msg)
prev_log_value = "" # 上一帧的日志内容
def consume_logs(dummy=None):
"""每秒更新 log_box 内容,避免 chat 阻塞 UI 刷新"""
global LOG_TXT, prev_log_value
buf = deque(LOG_TXT.splitlines(), maxlen=400)
while not LOG_Q.empty():
buf.append(LOG_Q.get())
LOG_TXT = "\n".join(buf)
if LOG_TXT != prev_log_value:
prev_log_value = LOG_TXT
return gr.update(value=LOG_TXT)
return gr.update()  # no change, so skip the frontend refresh
# ───────────────────── Backend calls ─────────────────────
def backend(text, sampling):
payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
try:
r = requests.post(API_URL,
headers={"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json"},
json=payload, timeout=180)
try:
data = r.json()
except Exception:
data = {}
fr = data.get("meta_info", {}).get("finish_reason")
ctok = data.get("meta_info", {}).get("completion_tokens")
log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
f"🟢 resp800={r.text[:800]!r}")
if r.status_code != 200:
return f"[HTTP {r.status_code}] {r.text[:300]}"
return data.get("text", "").strip() or "[⚠ empty]"
except Exception as e:
log(f"[❌ 请求异常] {e}")
return f"[❌ 请求异常] {e}"
# ───────────────────── Chat callback ─────────────────────
def chat(
user, history,
max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_raw,
log_state
):
import threading
from queue import Queue, Empty
stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
samp = {
"max_new_tokens": int(max_new),
"temperature": temp,
"top_p": top_p,
"top_k": int(top_k),
"repetition_penalty": rep_pen,
"presence_penalty": pres_pen,
**({"stop": stop} if stop else {})
}
result_q = Queue()
# run the backend request in a background thread
def worker():
out = backend(user, samp)
result_q.put(out)
thread = threading.Thread(target=worker)
thread.start()
# yield a placeholder first
yield "⏳ Generating...", log_state
# poll the result queue every 0.1 s (keeps the UI responsive)
while thread.is_alive() or not result_q.empty():
try:
result = result_q.get(timeout=0.1)
yield result, log_state
except Empty:
continue
# ───────────────────── Gradio UI ─────────────────────
with gr.Blocks(title="调试界面") as demo:
gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
# Sampling-parameter widgets
with gr.Row():
max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
with gr.Row():
top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
with gr.Row():
rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
log_state = gr.State("") # 状态透传
dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)  # ✅ off by default
log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)  # ✅ hidden by default
# Chat interface (placed before the log)
chatbot = gr.ChatInterface(
fn=chat,
additional_inputs=[max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_txt, log_state],
additional_outputs=[log_state],
type="messages"
)
# Log-refresh timer
timer = gr.Timer(1.0, render=True)
timer.tick(
fn=consume_logs,
inputs=[],
outputs=[log_box],
)
log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -1,10 +0,0 @@
{
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
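(A pre-tuned Triton tile configuration for sglang's fused MoE kernel; the top-level key, "64" here, appears to select the config by token-batch size. The Dockerfile above copies these files into sglang/srt/layers/moe/fused_moe_triton/configs.)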

View File

@ -216,13 +216,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
@app.get("/health")
async def health():
"""Check the health of the http server and return version info."""
return {
"status": "ok",
"name": "sglang_0.4.8.post1",
"version": "v1.0.0" # 这里写上你希望显示的版本号
}
async def health() -> Response:
"""Check the health of the http server."""
return Response(status_code=200)
@app.get("/health_generate")

View File

@ -868,22 +868,12 @@ def set_ulimit(target_soft_limit=65535):
def add_api_key_middleware(app, api_key: str):
@app.middleware("http")
async def authentication(request, call_next):
# Let OPTIONS (CORS preflight) requests through without auth
if request.method == "OPTIONS":
return await call_next(request)
# Explicitly list the path prefixes that skip authentication
whitelist_prefixes = (
"/health",
"/metrics",
"/ping",
"/get_model_info",
)
if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
if request.url.path.startswith("/health"):
return await call_next(request)
if request.url.path.startswith("/metrics"):
return await call_next(request)
# Bearer-token authentication
if request.headers.get("Authorization") != "Bearer " + api_key:
return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
return await call_next(request)
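# Quick check of the auth behavior (assumes the port and API key from the Dockerfiles above):
#   curl http://localhost:30000/health   -> 200 without a token (whitelisted)
#   curl http://localhost:30000/generate -> 401 {"error": "Unauthorized"}
#   adding -H "Authorization: Bearer token-abc123" satisfies the middleware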

View File

@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

View File

@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0