Compare commits

...

94 Commits
v1.0.0 ... main

Author SHA1 Message Date
hailin 29de4e1411 . 2025-09-14 19:12:27 +08:00
hailin c516e234c0 . 2025-09-14 18:46:44 +08:00
hailin 106e5784e2 . 2025-09-14 18:39:52 +08:00
hailin 7669db4b55 . 2025-09-14 18:07:18 +08:00
hailin af007765a3 . 2025-09-03 10:33:08 +08:00
hailin 363c90da1b . 2025-09-03 10:01:21 +08:00
hailin 54fd416073 . 2025-08-01 20:17:23 +08:00
hailin 01ce15ddeb . 2025-08-01 20:15:06 +08:00
hailin aec50e2029 . 2025-08-01 14:41:37 +08:00
hailin 45c24387d9 . 2025-08-01 14:34:13 +08:00
hailin db9e41c3e0 . 2025-08-01 14:27:35 +08:00
hailin f32175aa48 . 2025-08-01 14:25:54 +08:00
hailin effd559734 . 2025-08-01 14:13:26 +08:00
hailin a2cc08abc6 . 2025-08-01 14:04:09 +08:00
hailin e71c4823ef . 2025-08-01 13:53:06 +08:00
hailin ebe7f87009 . 2025-08-01 13:34:30 +08:00
hailin 66b11eb836 . 2025-08-01 13:33:54 +08:00
hailin d2df3af90f . 2025-08-01 11:54:41 +08:00
hailin 47bb4e366e . 2025-08-01 11:43:27 +08:00
hailin 452a2ed902 . 2025-08-01 11:30:15 +08:00
hailin d33a596dfa . 2025-08-01 11:12:44 +08:00
hailin 985871bf02 . 2025-08-01 11:07:47 +08:00
hailin eb6f9ba605 . 2025-08-01 11:00:32 +08:00
hailin 342727753a . 2025-08-01 10:32:10 +08:00
hailin 0b2a49fe2c . 2025-08-01 10:23:29 +08:00
hailin 89053e46ef . 2025-08-01 10:15:09 +08:00
hailin 08e5939764 . 2025-08-01 10:02:15 +08:00
hailin d4823afc81 . 2025-08-01 09:52:03 +08:00
hailin 99a6957d04 . 2025-08-01 09:45:14 +08:00
hailin 7c375562cd . 2025-08-01 09:36:07 +08:00
hailin 26f8dc9ab5 . 2025-08-01 09:28:41 +08:00
hailin f86051512d . 2025-07-31 10:21:30 +08:00
hailin 0b24f7e814 . 2025-07-27 19:37:58 +08:00
hailin 9cb53f50f6 . 2025-07-27 19:32:27 +08:00
hailin 91194df5d8 . 2025-07-27 19:12:14 +08:00
hailin 0ce5191d31 . 2025-07-27 19:07:21 +08:00
hailin 095311d016 . 2025-07-27 18:53:06 +08:00
hailin f904c754e2 . 2025-07-27 18:44:59 +08:00
hailin 79abd2bbdd . 2025-07-27 18:34:25 +08:00
hailin 900be3e02d . 2025-07-27 18:25:42 +08:00
hailin 4bb857f22f . 2025-07-27 18:18:31 +08:00
hailin 44c3814d13 . 2025-07-27 17:24:27 +08:00
hailin 7bdc80cd1e . 2025-07-27 17:16:47 +08:00
hailin 8f12b8269a . 2025-07-27 17:05:54 +08:00
hailin 34c0c43673 . 2025-07-27 16:56:46 +08:00
hailin 6d8fbdc748 . 2025-07-27 16:43:48 +08:00
hailin 244d407937 . 2025-07-27 16:38:15 +08:00
hailin f8a7f93747 . 2025-07-27 16:26:55 +08:00
hailin c912bd2f74 . 2025-07-27 16:07:58 +08:00
hailin 6137a2e0d3 . 2025-07-27 16:05:31 +08:00
hailin 3e8115b036 . 2025-07-27 16:00:36 +08:00
hailin c8c95bd62f . 2025-07-27 15:50:44 +08:00
hailin 871d5994af . 2025-07-27 15:39:59 +08:00
hailin c2b7ec20b8 . 2025-07-27 15:32:23 +08:00
hailin 5d640d814b . 2025-07-27 15:22:59 +08:00
hailin 991f5c81a8 . 2025-07-27 15:21:08 +08:00
hailin 75c97d6423 . 2025-07-27 15:18:38 +08:00
hailin 4559c52759 . 2025-07-27 15:07:55 +08:00
hailin 8c2b8ca785 . 2025-07-27 15:05:16 +08:00
hailin 8282e562ae . 2025-07-27 15:02:47 +08:00
hailin 0b560f7067 . 2025-07-27 15:00:44 +08:00
hailin 82e5957f8e . 2025-07-27 12:42:37 +08:00
hailin d18985e8a3 . 2025-07-27 12:35:49 +08:00
hailin 4071f51150 . 2025-07-27 12:30:04 +08:00
hailin 818a722192 . 2025-07-27 12:29:24 +08:00
hailin 68a12b4b4a . 2025-07-27 12:23:08 +08:00
hailin ccf3398741 . 2025-07-27 12:07:21 +08:00
hailin b42b5f090b . 2025-07-27 11:13:13 +08:00
hailin 0333b8af9c . 2025-07-27 10:52:28 +08:00
hailin f932f0bd5f . 2025-07-27 10:13:06 +08:00
hailin d1a2b815b3 . 2025-07-26 22:19:16 +08:00
hailin 49b8cae1bb . 2025-07-26 16:55:42 +08:00
hailin b70297ece1 . 2025-07-26 16:42:47 +08:00
hailin f0e15aa1d8 . 2025-07-26 08:58:30 +08:00
hailin d2f69be68d . 2025-07-25 17:03:33 +08:00
hailin 6aa0932210 . 2025-07-25 16:48:48 +08:00
hailin 174a6b2d76 . 2025-07-25 16:30:15 +08:00
hailin 2cfc960bc3 . 2025-07-25 16:11:48 +08:00
hailin 222c46ef15 . 2025-07-25 16:05:44 +08:00
hailin 2e621b202d . 2025-07-25 15:33:33 +08:00
hailin b5036d09c3 . 2025-07-25 15:02:17 +08:00
hailin 39c32555d8 . 2025-07-25 14:58:06 +08:00
hailin 6ea2139b82 . 2025-07-25 12:32:56 +08:00
hailin 35ba2eab42 . 2025-07-25 12:19:03 +08:00
hailin f82e6c567f . 2025-07-25 11:48:27 +08:00
hailin 1a58b38c86 . 2025-07-24 13:09:00 +08:00
hailin d795691369 . 2025-07-17 10:54:12 +08:00
hailin e252241910 . 2025-07-16 12:47:43 +08:00
hailin a2a93c7c4c . 2025-07-07 15:24:18 +08:00
hailin c5e4ef4a6d . 2025-07-07 15:21:43 +08:00
hailin 8f6dc142af . 2025-07-07 14:52:40 +08:00
hailin 9ca3ebe4bb . 2025-07-07 14:01:39 +08:00
hailin 1d3223c4ae . 2025-07-04 18:24:57 +08:00
hailin 023d2a0868 . 2025-07-04 17:45:05 +08:00
20 changed files with 2406 additions and 93 deletions

View File

@ -89,8 +89,9 @@ WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# ── sgl-kernel 的 Python 模块 ───────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# ── 🔄 下载 sgl-kernel与 sglang 同步)───────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── 收集所有 wheel 到 /wheels ──────────────────────────────────────────────
RUN mkdir -p /wheels && \
@ -99,6 +100,7 @@ RUN mkdir -p /wheels && \
cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# ── ✅ 再打包 runtime 阶段必需依赖 ────────────────────────────────────────────
@ -108,6 +110,9 @@ RUN pip wheel \
setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
-w /wheels
# ── ✅ 打包 gradio UI 所需依赖 ────────────────────────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 ─ runtime极简运行镜像仅离线安装 wheel
###############################################################################
@ -117,7 +122,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6\
python3 python3-dev python3-pip python3-distutils ca-certificates \
python3 python3-dev python3-pip python3-distutils curl ca-certificates \
libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
rm -rf /var/lib/apt/lists/* && \
python3 -m pip install --no-cache-dir --upgrade pip \
@ -130,27 +135,35 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
# 👇建议在后面补上
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
# ---- 拷贝预调优的 MoE Triton kernel config ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ 优先装你自编的 torch避免被 PyPI 上的覆盖
RUN ls -lh /tmp/wheels && \
rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
rm -rf /tmp/wheels
# # 安装运行时漏掉的依赖
# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton
# ✅ 离线安装全部依赖(包含所有运行时必需包)
# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
# python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
# rm -rf /tmp/wheels
# ✅ 安装 Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client
# ✅ 设置多进程 metrics 收集目录(用于 MultiProcessCollector
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# ✅ 确保目录存在
RUN mkdir -p /tmp/prometheus
# ✅ 添加 Tini推荐
ENV TINI_VERSION=v0.19.0
@ -159,15 +172,20 @@ RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- 拷贝模型(路径可换) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- 暴露端口 ----
EXPOSE 30000
EXPOSE 30000 30001
# ---- 启动 SGLang 推理服务 ----
CMD ["python3", "-m", "sglang.launch_server", \
"--host", "0.0.0.0", \
"--port", "30000", \
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
"--tp", "1", \
"--api-key", "token-abc123"]
# 安装 supervisor
RUN apt-get update && apt-get install -y supervisor && \
mkdir -p /etc/supervisor/conf.d
# 拷贝 supervisord 配置文件和 UI 脚本
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# 作为容器主进程运行 supervisor
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

191
Dockerfile.Qwen3-30B-A3B Normal file
View File

@ -0,0 +1,191 @@
###############################################################################
# Stage 0 - builder-torch: build a PyTorch v2.7.1 wheel from source (cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches exported for the PyTorch build below: CUDA and the
# distributed backends (MPI, Gloo, NCCL) are enabled, test targets are off.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Toolchain, BLAS/MPI headers and a pinned NCCL, followed by the Python
# packages installed ahead of the source build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 python3-dev python3-pip python3-distutils \
        git cmake ninja-build \
        libopenblas-dev libopenmpi-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
# Promote the build arg to an environment variable so the build sees it.
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, then build or
# download wheels for torchvision / flashinfer / vllm / sglang and collect
# everything under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that a package name can never be glued
# to the first token of the following line when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# -- Install the torch wheel produced by the builder-torch stage --------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# -- Build torchvision 0.22.1 against the locally installed torch -------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# -- Build flashinfer ----------------------------------------------------------
# NOTE(review): the clone names no branch or tag, so the result depends on the
# state of the default branch at build time - confirm this is intended.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # -- Install vllm from git without compiling (disabled) ---------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# -- Install the local sglang sources and build their wheel -------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# -- Download sgl-kernel, pinned to stay in step with sglang ------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# -- Gather every wheel under /wheels ------------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# -- Wheels for the dependencies the runtime stage installs -------------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 \
        cloudpickle compressed-tensors einops openai py-cpuinfo dill \
        partial_json_parser python-multipart torchao \
        -w /wheels

# -- Wheels for the gradio UI --------------------------------------------------
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by the
# builder-extras stage
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that "cuda-compiler-12-6" cannot be
# glued to "python3" when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin \
        libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar

# Copy the CUPTI shared libraries out of the devel-based builder image and
# refresh the dynamic linker cache.
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
RUN ldconfig

# ---- Pre-tuned MoE Triton kernel configs -------------------------------------
# NOTE(review): this COPY runs before the sglang wheel is installed below; if
# the wheel ships files with the same names they may replace these - confirm.
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# Install torch, vllm, sgl_kernel and gradio explicitly, then every remaining
# wheel in a single pass. (The bulk install used to be issued twice, once via
# `ls | grep | sed` and once via `find`; only the stricter `find` form, which
# matches regular *.whl files only, is kept.)
# NOTE(review): the first `rm -f` removes wheels named torch-2.7.1a0+*, which
# looks like the naming of a source-built torch wheel - confirm which torch
# wheel is meant to survive this step.
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# Prometheus client plus the directory used for multi-process metrics.
RUN python3 -m pip install --no-cache-dir prometheus_client
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
RUN mkdir -p /tmp/prometheus

# Tini as the container entrypoint.
# NOTE(review): the binary is fetched without a checksum.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Model weights (path can be changed) -------------------------------------
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Exposed ports -------------------------------------------------------------
EXPOSE 30000 30001

# Install supervisor without recommended extras and drop the apt lists in the
# same layer so they do not stay in the image.
RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /etc/supervisor/conf.d

# supervisord configuration and the UI script.
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# supervisord is the container's main command (started through tini).
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@ -0,0 +1,191 @@
###############################################################################
# Stage 0 - builder-torch: build a PyTorch v2.7.1 wheel from source (cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches exported for the PyTorch build below: CUDA and the
# distributed backends (MPI, Gloo, NCCL) are enabled, test targets are off.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Toolchain, BLAS/MPI headers and a pinned NCCL, followed by the Python
# packages installed ahead of the source build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 python3-dev python3-pip python3-distutils \
        git cmake ninja-build \
        libopenblas-dev libopenmpi-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
# Promote the build arg to an environment variable so the build sees it.
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, then build or
# download wheels for torchvision / flashinfer / vllm / sglang and collect
# everything under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that a package name can never be glued
# to the first token of the following line when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# -- Install the torch wheel produced by the builder-torch stage --------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# -- Build torchvision 0.22.1 against the locally installed torch -------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# -- Build flashinfer ----------------------------------------------------------
# NOTE(review): the clone names no branch or tag, so the result depends on the
# state of the default branch at build time - confirm this is intended.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # -- Install vllm from git without compiling (disabled) ---------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# -- Install the local sglang sources and build their wheel -------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# -- Download sgl-kernel, pinned to stay in step with sglang ------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# -- Gather every wheel under /wheels ------------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# -- Wheels for the dependencies the runtime stage installs -------------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 \
        cloudpickle compressed-tensors einops openai py-cpuinfo dill \
        partial_json_parser python-multipart torchao \
        -w /wheels

# -- Wheels for the gradio UI --------------------------------------------------
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by the
# builder-extras stage
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that "cuda-compiler-12-6" cannot be
# glued to "python3" when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin \
        libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar

# Copy the CUPTI shared libraries out of the devel-based builder image and
# refresh the dynamic linker cache.
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
RUN ldconfig

# ---- Pre-tuned MoE Triton kernel configs -------------------------------------
# NOTE(review): this COPY runs before the sglang wheel is installed below; if
# the wheel ships files with the same names they may replace these - confirm.
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# Install torch, vllm, sgl_kernel and gradio explicitly, then every remaining
# wheel in a single pass. (The bulk install used to be issued twice, once via
# `ls | grep | sed` and once via `find`; only the stricter `find` form, which
# matches regular *.whl files only, is kept.)
# NOTE(review): the first `rm -f` removes wheels named torch-2.7.1a0+*, which
# looks like the naming of a source-built torch wheel - confirm which torch
# wheel is meant to survive this step.
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# Prometheus client plus the directory used for multi-process metrics.
RUN python3 -m pip install --no-cache-dir prometheus_client
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
RUN mkdir -p /tmp/prometheus

# Tini as the container entrypoint.
# NOTE(review): the binary is fetched without a checksum.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Model weights (path can be changed) -------------------------------------
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Exposed ports -------------------------------------------------------------
EXPOSE 30000 30001

# Install supervisor without recommended extras and drop the apt lists in the
# same layer so they do not stay in the image.
RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /etc/supervisor/conf.d

# supervisord configuration and the UI script.
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# supervisord is the container's main command (started through tini).
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

View File

@ -1,63 +0,0 @@
############################################################
# Stage-0: install the Python stack                        #
#          (PyTorch + SGLang + sgl_kernel)                 #
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder

# ---- Python environment ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six

# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1

# Map the CUDA version onto the matching PyTorch wheel index (cu126 / cu128),
# install torch + torchvision from it, then sglang / sgl-kernel / NCCL without
# dependency resolution, and finally the required supporting packages.
# NOTE(review): CUDA_VERSION is not re-declared with ARG inside this stage;
# presumably "$CUDA_VERSION" resolves from the base image environment - confirm.
RUN case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac && \
    python -m pip install --no-cache-dir \
        torch==${TORCH_VER}+cu${CUINDEX} \
        torchvision==${TV_VER}+cu${CUINDEX} \
        --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
    python -m pip install --no-cache-dir \
        sglang==0.4.8.post1 \
        sgl-kernel==0.0.2.post17 \
        nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
    python -m pip install --no-cache-dir \
        pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece

# Module import smoke test (disabled):
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"
############################################################
# Stage-1: assemble the minimal runtime image              #
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# ---- Python runtime ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

# ---- Python packages and console entry points from the builder stage ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# ---- Model weights (path can be changed) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

# ---- Start the service ----
# NOTE(review): the API key is a literal in the image's default command.
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123"]

177
Dockerfile.ds_llama_70b Normal file
View File

@ -0,0 +1,177 @@
###############################################################################
# Stage 0 - builder-torch: build a PyTorch v2.7.1 wheel from source (cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches exported for the PyTorch build below: CUDA and the
# distributed backends (MPI, Gloo, NCCL) are enabled, test targets are off.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Toolchain, BLAS/MPI headers and a pinned NCCL, followed by the Python
# packages installed ahead of the source build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 python3-dev python3-pip python3-distutils \
        git cmake ninja-build \
        libopenblas-dev libopenmpi-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
# Promote the build arg to an environment variable so the build sees it.
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, then build or
# download wheels for torchvision / flashinfer / vllm / sglang and collect
# them under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that a package name can never be glued
# to the first token of the following line when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# -- Install the torch wheel produced by the builder-torch stage --------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# -- Build torchvision 0.22.1 against the locally installed torch -------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# -- Build flashinfer ----------------------------------------------------------
# NOTE(review): the clone names no branch or tag, so the result depends on the
# state of the default branch at build time - confirm this is intended.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # -- Install vllm from git without compiling (disabled) ---------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# -- Install the local sglang sources and build their wheel -------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# -- Download the sgl-kernel wheel; it stays in its own directory -------------
# NOTE(review): sgl-kernel carries no version pin here, so the wheel that is
# fetched depends on the index at build time - confirm it matches ./sglang.
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# -- Gather the built wheels under /wheels -------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# -- Wheels for the dependencies the runtime stage installs -------------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 \
        cloudpickle compressed-tensors einops openai py-cpuinfo dill \
        partial_json_parser python-multipart torchao \
        -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by the
# builder-extras stage
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

# Every continued line ends in " \" so that "cuda-compiler-12-6" cannot be
# glued to "python3" when the lines are joined.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin \
        libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar

# Copy the CUPTI shared libraries out of the devel-based builder image and
# refresh the dynamic linker cache.
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels

# Install torch, vllm and sgl_kernel explicitly, then every wheel in
# /tmp/wheels, and finish with an import check of torch.distributed.
# NOTE(review): the `rm -f` removes wheels named torch-2.7.1a0+*, which looks
# like the naming of a source-built torch wheel - confirm which torch wheel is
# meant to survive this step.
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# Prometheus client plus the directory used for multi-process metrics.
RUN python3 -m pip install --no-cache-dir prometheus_client
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
RUN mkdir -p /tmp/prometheus

# Tini as the container entrypoint.
# NOTE(review): the binary is fetched without a checksum.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Model weights (path can be changed) -------------------------------------
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Exposed port --------------------------------------------------------------
EXPOSE 30000

# ---- Launch the SGLang inference server --------------------------------------
# NOTE(review): the API key is a literal in the image's default command, so it
# is readable by anyone who can inspect the image - consider supplying it when
# the container is started instead.
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]

191
Dockerfile.llm_external Normal file
View File

@ -0,0 +1,191 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch v2.7.1 from source on the CUDA devel
# image; setup.py bdist_wheel leaves the wheel in /opt/pytorch/dist
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Feature switches exported for the PyTorch build at the end of this stage.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Build parallelism; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Non-interactive apt, unbuffered Python, UTF-8 locale and the CUDA
# architecture list used for the build.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Build prerequisites (NCCL pinned to 2.22.3-1+cuda12.6), followed by the
# Python packages installed ahead of the build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources including all submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Export MAX_JOBS into the environment and build the wheel.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
# flashinfer / sglang against it, and collect all wheels under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Build toolchain and headers.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "libopenblas-dev\" would fuse with "libnccl2=..." on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# -- Build torchvision 0.22.1 (depends on the local torch) --------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
# NOTE(review): the clone is not pinned to a tag or commit and is not
# recursive, so the result depends on the upstream branch at build time.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- (disabled) install vllm straight from git, skipping the build ----------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Download sgl-kernel (version kept in sync with sglang) -------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# -- Collect all wheels under /wheels -----------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Wheels for the dependencies required by the runtime stage ----------------
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels
# -- Wheels for the gradio UI -------------------------------------------------
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 - runtime: minimal runtime image that installs the collected wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# System packages, then a pip upgrade and xgrammar.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "cuda-compiler-12-6\" would fuse with "python3" on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries from the builder stage (avoids hard-coding the
# full version number in the file name).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic linker cache after adding the libraries.
RUN ldconfig
# ---- Copy the pre-tuned MoE Triton kernel configs ----------------------------
# NOTE(review): this runs before the sglang wheel is installed below, so files
# with the same names shipped in the wheel may replace these - confirm the order.
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
COPY --from=builder-extras /wheels /tmp/wheels
# Install wheels matching torch*.whl first, then vllm, sgl-kernel and gradio,
# then every remaining wheel; --no-deps installs exactly the given files.
# The remaining wheels are selected once with find (a second, equivalent
# "ls | grep | sed" pass over the same files was dropped).
# NOTE(review): the first rm deletes wheels named torch-2.7.1a0+*, which may be
# the locally built torch wheel, although the original comment says the
# self-built torch should take precedence over the PyPI one - confirm intent.
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels
# Install the Prometheus client.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Add tini and use it as the entrypoint wrapper.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model into the image (disabled; the path can be changed) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed ports ----
EXPOSE 30000 30001
# Install supervisor; skip recommended packages and drop the apt lists in the
# same layer so they do not stay in the image.
RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /etc/supervisor/conf.d
# Copy the UI script and the supervisord configuration.
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisord as the container's main command.
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

177
Dockerfile.qwen3-14b Normal file
View File

@ -0,0 +1,177 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch v2.7.1 from source on the CUDA devel
# image; setup.py bdist_wheel leaves the wheel in /opt/pytorch/dist
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Feature switches exported for the PyTorch build at the end of this stage.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Build parallelism; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Non-interactive apt, unbuffered Python, UTF-8 locale and the CUDA
# architecture list used for the build.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Build prerequisites (NCCL pinned to 2.22.3-1+cuda12.6), followed by the
# Python packages installed ahead of the build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources including all submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Export MAX_JOBS into the environment and build the wheel.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
# flashinfer / sglang against it, and collect all wheels under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Build toolchain and headers.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "libopenblas-dev\" would fuse with "libnccl2=..." on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# -- Build torchvision 0.22.1 (depends on the local torch) --------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
# NOTE(review): the clone is not pinned to a tag or commit and is not
# recursive, so the result depends on the upstream branch at build time.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- (disabled) install vllm straight from git, skipping the build ----------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Download the sgl-kernel wheel (unpinned) ---------------------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# -- Collect all wheels under /wheels -----------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Wheels for the dependencies required by the runtime stage ----------------
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels
###############################################################################
# Stage 2 - runtime: minimal runtime image that installs the collected wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# System packages, then a pip upgrade and xgrammar.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "cuda-compiler-12-6\" would fuse with "python3" on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries from the builder stage (avoids hard-coding the
# full version number in the file name).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic linker cache after adding the libraries.
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
# Earlier single-step variant, kept for reference:
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# Install wheels matching torch*.whl first, then vllm, sgl-kernel and finally
# everything in /tmp/wheels; --no-deps installs exactly the given files.
# NOTE(review): the first rm deletes wheels named torch-2.7.1a0+*, which may be
# the locally built torch wheel, although the original comment says the
# self-built torch should take precedence over the PyPI one - confirm intent.
# Both wheel directories are removed at the end (the sgl-kernel one used to be
# left behind).
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
# Install the Prometheus client.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Add tini and use it as the entrypoint wrapper.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model into the image (the path can be changed) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed port ----
EXPOSE 30000
# ---- Start the SGLang inference server ----
# NOTE(review): the API key is a literal in the default command and is therefore
# readable by anyone who has the image - consider supplying it at run time.
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
     "--tp", "2", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]

183
Dockerfile.qwen3-14b-base Normal file
View File

@ -0,0 +1,183 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch v2.7.1 from source on the CUDA devel
# image; setup.py bdist_wheel leaves the wheel in /opt/pytorch/dist
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Feature switches exported for the PyTorch build at the end of this stage.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Build parallelism; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Non-interactive apt, unbuffered Python, UTF-8 locale and the CUDA
# architecture list used for the build.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Build prerequisites (NCCL pinned to 2.22.3-1+cuda12.6), followed by the
# Python packages installed ahead of the build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources including all submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Export MAX_JOBS into the environment and build the wheel.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
# flashinfer / sglang against it, and collect all wheels under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Build toolchain and headers.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "libopenblas-dev\" would fuse with "libnccl2=..." on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# -- Build torchvision 0.22.1 (depends on the local torch) --------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
# NOTE(review): the clone is not pinned to a tag or commit and is not
# recursive, so the result depends on the upstream branch at build time.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- (disabled) install vllm straight from git, skipping the build ----------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Download the sgl-kernel wheel (unpinned) ---------------------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# -- Collect all wheels under /wheels -----------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Wheels for the dependencies required by the runtime stage ----------------
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels
# -- Wheels for the gradio UI (gradio itself is unpinned here) ----------------
RUN pip wheel gradio requests -w /wheels
###############################################################################
# Stage 2 - runtime: minimal runtime image that installs the collected wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# System packages, then a pip upgrade and xgrammar.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "cuda-compiler-12-6\" would fuse with "python3" on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries from the builder stage (avoids hard-coding the
# full version number in the file name).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic linker cache after adding the libraries.
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
# Earlier single-step variant, kept for reference:
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# Install wheels matching torch*.whl first, then vllm, sgl-kernel and finally
# everything in /tmp/wheels; --no-deps installs exactly the given files.
# NOTE(review): the first rm deletes wheels named torch-2.7.1a0+*, which may be
# the locally built torch wheel, although the original comment says the
# self-built torch should take precedence over the PyPI one - confirm intent.
# Both wheel directories are removed at the end (the sgl-kernel one used to be
# left behind).
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
# Install the Prometheus client.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Add tini and use it as the entrypoint wrapper.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model into the image (the path can be changed) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed ports ----
EXPOSE 30000 30001
# Install supervisor; skip recommended packages and drop the apt lists in the
# same layer so they do not stay in the image.
RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /etc/supervisor/conf.d
# Copy the UI script and the supervisord configuration.
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisord as the container's main command.
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

177
Dockerfile.qwen3-32b Normal file
View File

@ -0,0 +1,177 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch v2.7.1 from source on the CUDA devel
# image; setup.py bdist_wheel leaves the wheel in /opt/pytorch/dist
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Feature switches exported for the PyTorch build at the end of this stage.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

# Build parallelism; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Non-interactive apt, unbuffered Python, UTF-8 locale and the CUDA
# architecture list used for the build.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Build prerequisites (NCCL pinned to 2.22.3-1+cuda12.6), followed by the
# Python packages installed ahead of the build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources including all submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Export MAX_JOBS into the environment and build the wheel.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
# flashinfer / sglang against it, and collect all wheels under /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Build toolchain and headers.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "libopenblas-dev\" would fuse with "libnccl2=..." on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir
# -- Build torchvision 0.22.1 (depends on the local torch) --------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (main branch supports torch 2.7 / cu126) ----------------
# NOTE(review): the clone is not pinned to a tag or commit and is not
# recursive, so the result depends on the upstream branch at build time.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- (disabled) install vllm straight from git, skipping the build ----------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Download the sgl-kernel wheel (unpinned) ---------------------------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# -- Collect all wheels under /wheels -----------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Wheels for the dependencies required by the runtime stage ----------------
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels
###############################################################################
# Stage 2 - runtime: minimal runtime image that installs the collected wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# System packages, then a pip upgrade and xgrammar.
# Keep the space before every trailing backslash: continuation lines are joined
# verbatim, so "cuda-compiler-12-6\" would fuse with "python3" on the next line.
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries from the builder stage (avoids hard-coding the
# full version number in the file name).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic linker cache after adding the libraries.
RUN ldconfig
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
# Earlier single-step variant, kept for reference:
#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# Install wheels matching torch*.whl first, then vllm, sgl-kernel and finally
# everything in /tmp/wheels; --no-deps installs exactly the given files.
# NOTE(review): the first rm deletes wheels named torch-2.7.1a0+*, which may be
# the locally built torch wheel, although the original comment says the
# self-built torch should take precedence over the PyPI one - confirm intent.
# Both wheel directories are removed at the end (the sgl-kernel one used to be
# left behind).
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels /tmp/sgl_kernel_wheel
# Install the Prometheus client.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Add tini and use it as the entrypoint wrapper.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model into the image (the path can be changed) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed port ----
EXPOSE 30000
# ---- Start the SGLang inference server ----
# NOTE(review): the API key is a literal in the default command and is therefore
# readable by anyone who has the image - consider supplying it at run time.
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]

177
Dockerfile.qwen3-8b Normal file
View File

@ -0,0 +1,177 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch 2.7.1 (+cu126) into a wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches, locale settings and target GPU architectures for the build.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0 \
    DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Toolchain, BLAS/MPI headers and the pinned NCCL packages, then the Python
# packages installed ahead of the PyTorch build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources together with their submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Build the wheel; the result lands in /opt/pytorch/dist.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
#           flashinfer / sglang and collect every wheel into /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
# The glob is passed straight to pip so a missing wheel fails the build here,
# instead of being skipped silently (as `find | xargs -r` would do).
RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
# -- Build torchvision 0.22.1 (against the local torch) -----------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (original note: main branch supports torch 2.7 / cu126) --
# NOTE(review): the clone is not pinned to a tag or commit.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- Install vllm (skip compilation, install directly) ----------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Python module for sgl-kernel ---------------------------------------------
# NOTE(review): sgl-kernel is not version-pinned here.
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# -- Collect all wheels into /wheels ------------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Also package the dependencies needed by the runtime stage ----------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
        -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by
#           builder-extras
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries out of the devel-based builder stage
# (original note: avoid hard-coding the version number).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic-linker cache after adding the libraries above.
RUN ldconfig
# Install the wheels collected by builder-extras.
# The wheel directories are bind-mounted from the builder stage instead of being
# COPY'd: a COPY layer keeps every wheel in the final image even if a later RUN
# deletes them. /tmp/wheels is mounted `rw` so the `rm -f` below still works;
# anything written to the mount is discarded when this step ends.
# torch is installed first (original note: prefer the self-built torch so a PyPI
# build does not override it).
# NOTE(review): the `rm -f` drops wheels named torch-2.7.1a0+* before
# torch*.whl is installed - confirm this is the intended wheel selection.
RUN --mount=type=bind,from=builder-extras,source=/wheels,target=/tmp/wheels,rw \
    --mount=type=bind,from=builder-extras,source=/tmp/sgl_kernel_wheel,target=/tmp/sgl_kernel_wheel \
    ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)"
# Prometheus client library.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory used for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Tini as the container entrypoint.
# NOTE(review): the download is not checksum-verified; consider `ADD --checksum=sha256:...`.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model weights (path can be changed) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed port ----
EXPOSE 30000
# ---- Start the SGLang inference server ----
# NOTE(review): the API key is hard-coded in the image; consider injecting it at run time.
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]

177
Dockerfile.qwq32b Normal file
View File

@ -0,0 +1,177 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch 2.7.1 (+cu126) into a wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches, locale settings and target GPU architectures for the build.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0 \
    DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Toolchain, BLAS/MPI headers and the pinned NCCL packages, then the Python
# packages installed ahead of the PyTorch build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources together with their submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Build the wheel; the result lands in /opt/pytorch/dist.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
#           flashinfer / sglang and collect every wheel into /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
# The glob is passed straight to pip so a missing wheel fails the build here,
# instead of being skipped silently (as `find | xargs -r` would do).
RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
# -- Build torchvision 0.22.1 (against the local torch) -----------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (original note: main branch supports torch 2.7 / cu126) --
# NOTE(review): the clone is not pinned to a tag or commit.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- Install vllm (skip compilation, install directly) ----------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Python module for sgl-kernel ---------------------------------------------
# NOTE(review): sgl-kernel is not version-pinned here.
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
# -- Collect all wheels into /wheels ------------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Also package the dependencies needed by the runtime stage ----------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
        -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by
#           builder-extras
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries out of the devel-based builder stage
# (original note: avoid hard-coding the version number).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic-linker cache after adding the libraries above.
RUN ldconfig
# Install the wheels collected by builder-extras.
# The wheel directories are bind-mounted from the builder stage instead of being
# COPY'd: a COPY layer keeps every wheel in the final image even if a later RUN
# deletes them. /tmp/wheels is mounted `rw` so the `rm -f` below still works;
# anything written to the mount is discarded when this step ends.
# torch is installed first (original note: prefer the self-built torch so a PyPI
# build does not override it).
# NOTE(review): the `rm -f` drops wheels named torch-2.7.1a0+* before
# torch*.whl is installed - confirm this is the intended wheel selection.
RUN --mount=type=bind,from=builder-extras,source=/wheels,target=/tmp/wheels,rw \
    --mount=type=bind,from=builder-extras,source=/tmp/sgl_kernel_wheel,target=/tmp/sgl_kernel_wheel \
    ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)"
# Prometheus client library.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory used for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Tini as the container entrypoint.
# NOTE(review): the download is not checksum-verified; consider `ADD --checksum=sha256:...`.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model weights (path can be changed) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed port ----
EXPOSE 30000
# ---- Start the SGLang inference server ----
# NOTE(review): the API key is hard-coded in the image; consider injecting it at run time.
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]

191
Dockerfile.tmp Normal file
View File

@ -0,0 +1,191 @@
###############################################################################
# Stage 0 - builder-torch: compile PyTorch 2.7.1 (+cu126) into a wheel
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

# Build switches, locale settings and target GPU architectures for the build.
ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0 \
    DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

# Parallel build jobs; override with --build-arg MAX_JOBS=<n>.
ARG MAX_JOBS=90

# Toolchain, BLAS/MPI headers and the pinned NCCL packages, then the Python
# packages installed ahead of the PyTorch build.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        git \
        libjpeg-dev \
        libnccl-dev=2.22.3-1+cuda12.6 \
        libnccl2=2.22.3-1+cuda12.6 \
        libopenblas-dev \
        libopenmpi-dev \
        libpng-dev \
        ninja-build \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip && \
    python3 -m pip install --no-cache-dir --upgrade \
        pip wheel setuptools sympy pyyaml typing-extensions numpy

# Fetch the v2.7.1 sources together with their submodules.
WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

# Build the wheel; the result lands in /opt/pytorch/dist.
WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel
###############################################################################
# Stage 1 - builder-extras: install the self-built torch, build torchvision /
#           flashinfer / sglang and collect every wheel into /wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-distutils python3.10-dev git build-essential \
        cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
        libopenmpi-dev libopenblas-dev \
        libnccl2=2.22.3-1+cuda12.6 \
        libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools
# -- Install the self-built torch wheel ---------------------------------------
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
# The glob is passed straight to pip so a missing wheel fails the build here,
# instead of being skipped silently (as `find | xargs -r` would do).
RUN echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    python3 -m pip install --no-cache-dir /tmp/torch_dist/torch-*.whl
# -- Build torchvision 0.22.1 (against the local torch) -----------------------
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel
# -- Build flashinfer (original note: main branch supports torch 2.7 / cu126) --
# NOTE(review): the clone is not pinned to a tag or commit.
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer
RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/
# # -- Install vllm (skip compilation, install directly) ----------------------
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
# pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
# python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps
# -- Download the prebuilt vllm wheel (avoids compiling flash-attn) -----------
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels
# -- Build the local sglang sources and produce a wheel -----------------------
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
# -- Download sgl-kernel (original note: kept in sync with sglang) ------------
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# -- Collect all wheels into /wheels ------------------------------------------
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
# -- Also package the dependencies needed by the runtime stage ----------------
RUN pip wheel \
        pydantic orjson psutil pyzmq pynvml \
        transformers==4.52.0 uvicorn fastapi IPython aiohttp \
        setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
        -w /wheels
# -- Package the dependencies of the gradio UI --------------------------------
RUN pip wheel "gradio==5.38.2" requests -w /wheels
###############################################################################
# Stage 2 - runtime: runtime image that installs the wheels collected by
#           builder-extras and runs the server plus the UI under supervisord
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
# Every continued line ends in " \" so adjacent package names can never be
# fused into one token, whatever the indentation of the following line.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc g++ build-essential ninja-build cuda-compiler-12-6 \
        python3 python3-dev python3-pip python3-distutils curl ca-certificates \
        libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir xgrammar
# Copy the CUPTI shared libraries out of the devel-based builder stage
# (original note: avoid hard-coding the version number).
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/
# Refresh the dynamic-linker cache after adding the libraries above.
RUN ldconfig
# Install the wheels collected by builder-extras.
# /wheels is bind-mounted from the builder stage instead of being COPY'd: a COPY
# layer keeps every wheel in the final image even if a later RUN deletes them.
# The mount is `rw` so the `rm -f` lines below still work; anything written to
# it is discarded when this step ends.
# torch is installed first (original note: prefer the self-built torch so a PyPI
# build does not override it). gradio is installed explicitly, then every
# remaining wheel except gradio-* is installed in a single pass.
# NOTE(review): the `rm -f` lines drop wheels named torch-2.7.1a0+* and
# huggingface_hub-0.33.4* - confirm this is the intended wheel selection.
RUN --mount=type=bind,from=builder-extras,source=/wheels,target=/tmp/wheels,rw \
    ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)"
# ---- Copy the pre-tuned MoE Triton kernel configs ----
# Done after the sglang wheel is installed so files shipped in the wheel cannot
# overwrite same-named tuned configs, and so a config change does not
# invalidate the wheel-install layer above.
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
# Prometheus client library.
RUN python3 -m pip install --no-cache-dir prometheus_client
# Directory used for multi-process metrics collection (MultiProcessCollector).
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
# Make sure that directory exists.
RUN mkdir -p /tmp/prometheus
# Tini as the container entrypoint.
# NOTE(review): the download is not checksum-verified; consider `ADD --checksum=sha256:...`.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
# ---- Copy the model weights (path can be changed) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
# ---- Exposed ports ----
EXPOSE 30000 30001
# Install supervisor; the apt lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends supervisor && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /etc/supervisor/conf.d
# Copy the UI script and the supervisord configuration.
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# Run supervisord as the container's main process.
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]

224
meta_ui.py Normal file
View File

@ -0,0 +1,224 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ────────────────── Basic configuration ──────────────────
API_KEY = "token-abc123"
# NOTE(review): confirm this directory matches where the image stores the weights.
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")


def model_name(path: Path):
    """Return a display name for the model stored under ``path``.

    When ``path / "config.json"`` exists, the first entry of its
    ``architectures`` list is returned, falling back to ``model_type`` and
    finally to the directory name. A missing, ``null`` or empty
    ``architectures`` value is treated as "no architecture" instead of raising.

    Args:
        path: Directory that is expected to contain ``config.json``.

    Returns:
        The architecture name, the model type, or ``path.name``.
    """
    cfg = path / "config.json"
    if not cfg.exists():
        return path.name
    # Context manager so the file handle is closed deterministically.
    with cfg.open() as fh:
        data = json.load(fh)
    archs = data.get("architectures") or [None]
    return archs[0] or data.get("model_type") or path.name


MODEL_NAME = model_name(MODEL_PATH)
# Wall-clock timestamp (HH:MM:SS) used as a prefix in log lines.
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ────────────────── Log queue ──────────────────
# Messages wait here until consume_logs() moves them into LOG_TXT.
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""


def log(msg):
    """Print ``msg`` to stdout (flushed) and enqueue it on ``LOG_Q``."""
    print(msg, flush=True)
    # The queue is unbounded, so the non-blocking put never raises Full.
    LOG_Q.put_nowait(msg)
# Log text most recently pushed to the UI; used to skip no-op refreshes.
prev_log_value = ""


def consume_logs(dummy=None):
    """Drain ``LOG_Q`` into ``LOG_TXT`` and return a Gradio update for the log box.

    Only the last 400 lines are kept. The queue is drained with
    ``get_nowait()``: an ``empty()`` check followed by a blocking ``get()``
    can hang forever if another caller empties the queue in between.

    Args:
        dummy: Ignored; accepted so the function can be wired to events that
            pass a value.

    Returns:
        ``gr.update(value=LOG_TXT)`` when the text changed, otherwise an empty
        ``gr.update()``.
    """
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while True:
        try:
            buf.append(LOG_Q.get_nowait())
        except queue.Empty:
            break
    LOG_TXT = "\n".join(buf)
    if LOG_TXT == prev_log_value:
        return gr.update()
    prev_log_value = LOG_TXT
    return gr.update(value=LOG_TXT)
# ────────────────── Backend call ──────────────────
def backend(text, sampling, api_suffix):
    """POST one request to the local server and return the generated text.

    Args:
        text: Prompt string, or the messages list when ``api_suffix`` is
            ``"/v1/chat/completions"``.
        sampling: Sampling parameters. Nested under ``sampling_params`` for
            ``/generate``; merged into the top-level payload otherwise.
        api_suffix: One of ``"/generate"``, ``"/v1/completions"`` or
            ``"/v1/chat/completions"``.

    Returns:
        The stripped completion text, ``"[⚠ 空]"`` when it is empty,
        ``"[HTTP <code>] <body>"`` for non-200 responses, or an
        ``"[❌ 请求异常] ..."`` string on errors. This function does not raise.
    """
    url = f"http://localhost:30000{api_suffix}"
    if api_suffix == "/generate":
        payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    elif api_suffix == "/v1/completions":
        payload = {
            "model": MODEL_NAME,
            "prompt": text,
            **sampling
        }
    elif api_suffix == "/v1/chat/completions":
        payload = {
            "model": MODEL_NAME,
            "messages": text,  # here `text` is actually the messages list
            **sampling
        }
    else:
        # Previously `payload` stayed unbound here and the log call below raised.
        err = f"[❌ 请求异常] unsupported api_suffix: {api_suffix!r}"
        log(err)
        return err
    log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(url,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except ValueError:
            # Body is not JSON (e.g. a plain-text error page).
            data = {}
        if not isinstance(data, dict):
            data = {}
        txt, fr, ctok = "", None, None
        if api_suffix == "/generate":
            txt = (data.get("text") or "").strip()
            meta = data.get("meta_info") or {}
            fr = meta.get("finish_reason")
            ctok = meta.get("completion_tokens")
        else:
            # Both OpenAI-style endpoints share the choices/usage layout.
            # `or` guards cover keys that are present but null or empty.
            choice = (data.get("choices") or [{}])[0] or {}
            ctok = (data.get("usage") or {}).get("completion_tokens")
            fr = choice.get("finish_reason")
            if api_suffix == "/v1/completions":
                txt = (choice.get("text") or "").strip()
            else:
                msg = choice.get("message") or {}
                txt = (msg.get("content") or "").strip()
        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp={r.text!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text}"
        return txt or "[⚠ 空]"
    except Exception as e:
        log(f"[❌ 请求异常] {e}")
        return f"[❌ 请求异常] {e}"
# ────────────────── Chat callback ──────────────────
def chat(
    user_msg, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    api_suffix, log_state
):
    """Gradio chat callback: send one request via ``backend`` and yield the reply.

    Args:
        user_msg: The user's message; a dict with a ``"text"`` key or a string.
        history: Prior conversation; forwarded as context only for
            ``/v1/chat/completions``.
        max_new, temp, top_p, top_k, rep_pen, pres_pen: Sampling controls.
        stop_raw: Comma-separated stop sequences.
        api_suffix: Endpoint path selected in the UI.
        log_state: Passed through unchanged as the second yielded value.

    Yields:
        ``({"text": reply}, log_state)`` for chat completions, otherwise
        ``(reply, log_state)``. Nothing is yielded if the worker thread ends
        without producing a result.
    """
    from queue import Queue, Empty

    user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg
    if api_suffix == "/v1/chat/completions":
        # Full history so the model sees the conversation context.
        messages = history[:]
        messages.append({"role": "user", "content": user})
        prompt_input = messages
    else:
        prompt_input = user
    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    # Only the native /generate endpoint takes `max_new_tokens`; both /v1/*
    # endpoints take `max_tokens`, so the slider now also applies to chat.
    max_key = "max_new_tokens" if api_suffix == "/generate" else "max_tokens"
    samp = {
        max_key: int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }
    result_q = Queue()

    def worker():
        result_q.put(backend(prompt_input, samp, api_suffix))

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()
    # Poll until a result arrives. If the worker died without producing one,
    # stop without yielding instead of reading an unbound variable.
    while True:
        try:
            result = result_q.get(timeout=0.1)
            break
        except Empty:
            if not thread.is_alive() and result_q.empty():
                return
    if api_suffix == "/v1/chat/completions":
        txt = result.strip() if isinstance(result, str) else str(result).strip()
        yield {"text": txt}, log_state
        return
    if isinstance(result, str):
        result = {"text": result}
    elif not isinstance(result, dict) or "text" not in result:
        result = {"text": str(result)}
    yield result["text"], log_state
# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="调试界面") as demo:
    gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
    # Endpoint selector.
    with gr.Row():
        api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
                                 value="/generate", label="选择推理接口")
    # Sampling-parameter controls.
    with gr.Row():
        max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
        temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen = gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
    stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
    log_state = gr.State("")
    dbg_chk = gr.Checkbox(label="📜 显示 Debug 面板", value=False)
    log_box = gr.Textbox(label="实时日志", lines=20, interactive=False, visible=False)
    # Bound to `chatbot` so the `chat` callback is not rebound to the
    # ChatInterface object.
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt,
                           api_choice, log_state],
        additional_outputs=[log_state],
        type="messages"
    )
    # Refresh the log box once per second.
    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    def clear_all_logs(_):
        """Drop queued and cached log text and clear both log outputs."""
        # LOG_Q is only mutated in place, so it needs no `global` declaration.
        global LOG_TXT, prev_log_value
        with LOG_Q.mutex:
            LOG_Q.queue.clear()
        LOG_TXT = ""
        prev_log_value = ""
        return gr.update(value=""), gr.update(value="")

    api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
demo.launch(server_name="0.0.0.0", server_port=30001)

79
meta_ui.py.old Normal file
View File

@ -0,0 +1,79 @@
import gradio as gr
import requests
API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"


# Build the prompt: a base model is driven by concatenated conversation context.
def build_prompt(history, user_message):
    """Render ``history`` pairs plus the new message as a User/Assistant transcript.

    Args:
        history: Iterable of ``(user, bot)`` pairs from earlier turns.
        user_message: The message for the turn being generated.

    Returns:
        The transcript, ending with an open ``Assistant:`` line.
    """
    turns = [f"User: {user}\nAssistant: {bot}\n" for user, bot in history]
    turns.append(f"User: {user_message}\nAssistant:")
    return "".join(turns)
# Main chat function
def chat(user_message, history, max_tokens, temperature):
    """Send the rendered conversation to the completions API and return the reply.

    Any exception raised while requesting or parsing is turned into a
    ``"[请求失败] ..."`` string rather than propagated.
    """
    request_body = {
        "model": MODEL_NAME,
        "prompt": build_prompt(history, user_message),
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": ["\nUser:", "\nAssistant:"]
    }
    auth_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    try:
        resp = requests.post(API_URL, headers=auth_headers, json=request_body, timeout=30)
        return resp.json()["choices"][0]["text"].strip()
    except Exception as e:
        return f"[请求失败] {e}"
# Manual API smoke test
def test_api_connection(max_tokens, temperature):
    """Send a fixed ``"Ping?"`` prompt and report whether the API answered.

    Returns:
        A ``"✅ ..."`` string containing the completion on success, or a
        ``"❌ ..."`` string containing the exception text on failure.
    """
    request_body = {
        "model": MODEL_NAME,
        "prompt": "Ping?",
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    auth_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    try:
        answer = requests.post(API_URL, headers=auth_headers, json=request_body, timeout=10)
        completion = answer.json()["choices"][0]["text"].strip()
    except Exception as e:
        return f"❌ API 请求失败: {e}"
    return f"✅ API 可用,响应: {completion}"
# Gradio component layout
with gr.Blocks(title="Base 模型测试 UI") as demo:
    gr.Markdown("# 💬 Base 模型对话界面")
    with gr.Row():
        max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
    test_btn = gr.Button("🔁 测试 API 可用性")
    test_output = gr.Textbox(label="API 测试结果", interactive=False)
    # The sliders are passed as additional inputs so each request uses their
    # current values. Reading `slider.value` inside a lambda only ever saw the
    # initial value, so moving the sliders had no effect on chat.
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_tokens, temperature],
        title=None
    )
    test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)
# Start the server
demo.launch(server_name="0.0.0.0", server_port=30001)

153
meta_ui_old.py Normal file
View File

@ -0,0 +1,153 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time
# ───────────────────── Basic configuration ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")


def model_name(path: Path):
    """Return a display name for the model stored under ``path``.

    When ``path / "config.json"`` exists, the first entry of its
    ``architectures`` list is returned, falling back to ``model_type`` and
    finally to the directory name. A missing, ``null`` or empty
    ``architectures`` value is treated as "no architecture" instead of raising.

    Args:
        path: Directory that is expected to contain ``config.json``.

    Returns:
        The architecture name, the model type, or ``path.name``.
    """
    cfg = path / "config.json"
    if not cfg.exists():
        return path.name
    # Context manager so the file handle is closed deterministically.
    with cfg.open() as fh:
        data = json.load(fh)
    archs = data.get("architectures") or [None]
    return archs[0] or data.get("model_type") or path.name


MODEL_NAME = model_name(MODEL_PATH)
# Wall-clock timestamp (HH:MM:SS) used as a prefix in log lines.
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")
# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
# Global log cache (original note: keeps log_box updating even while the chat
# has focus).
LOG_TXT = ""


def log(msg):
    """Write ``msg`` to the terminal (flushed) and push it onto ``LOG_Q``."""
    print(msg, flush=True)
    # The queue is unbounded, so the non-blocking put never raises Full.
    LOG_Q.put_nowait(msg)
# Log text from the previous refresh; used to skip no-op updates.
prev_log_value = ""


def consume_logs(dummy=None):
    """Drain ``LOG_Q`` into ``LOG_TXT`` and return a Gradio update for the log box.

    Wired to a one-second timer in the UI. Only the last 400 lines are kept.
    The queue is drained with ``get_nowait()``: an ``empty()`` check followed
    by a blocking ``get()`` can hang forever if another caller empties the
    queue in between.

    Args:
        dummy: Ignored; accepted so the function can be wired to events that
            pass a value.

    Returns:
        ``gr.update(value=LOG_TXT)`` when the text changed, otherwise an empty
        ``gr.update()`` so the front end is not refreshed.
    """
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while True:
        try:
            buf.append(LOG_Q.get_nowait())
        except queue.Empty:
            break
    LOG_TXT = "\n".join(buf)
    if LOG_TXT == prev_log_value:
        return gr.update()
    prev_log_value = LOG_TXT
    return gr.update(value=LOG_TXT)
# ───────────────────── Backend call ─────────────────────
def backend(text, sampling):
    """POST ``text`` with ``sampling`` to ``API_URL`` and return the generated text.

    Returns:
        The stripped ``text`` field of the JSON reply, ``"[⚠ 空]"`` when it is
        empty, ``"[HTTP <code>] <body prefix>"`` for non-200 responses, or an
        ``"[❌ 请求异常] ..."`` string if anything raised.
    """
    request_body = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    log(f"\n🟡 [{now()}] payload\n{json.dumps(request_body, ensure_ascii=False, indent=2)}")
    auth_headers = {"Authorization": f"Bearer {API_KEY}",
                    "Content-Type": "application/json"}
    try:
        resp = requests.post(API_URL, headers=auth_headers, json=request_body, timeout=180)
        try:
            parsed = resp.json()
        except Exception:
            # Non-JSON body: fall back to an empty mapping.
            parsed = {}
        finish = parsed.get("meta_info", {}).get("finish_reason")
        n_tokens = parsed.get("meta_info", {}).get("completion_tokens")
        log(f"🟢 [{now()}] HTTP {resp.status_code} tokens={n_tokens} finish={finish}\n"
            f"🟢 resp800={resp.text[:800]!r}")
        if resp.status_code == 200:
            return parsed.get("text", "").strip() or "[⚠ 空]"
        return f"[HTTP {resp.status_code}] {resp.text[:300]}"
    except Exception as e:
        log(f"[❌ 请求异常] {e}")
        return f"[❌ 请求异常] {e}"
# ───────────────────── Chat callback ─────────────────────
def chat(
    user, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    log_state
):
    """Gradio chat callback: run ``backend`` in a thread and yield its result.

    Yields a "generating" placeholder first, then every value the worker
    thread produces, each paired with the unchanged ``log_state``.
    """
    import threading
    from queue import Queue, Empty

    stop_list = [piece.strip() for piece in stop_raw.split(",") if piece.strip()]
    sampling = {
        "max_new_tokens": int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
    }
    if stop_list:
        sampling["stop"] = stop_list

    outbox = Queue()

    # Run the backend inference on a background thread.
    def worker():
        outbox.put(backend(user, sampling))

    runner = threading.Thread(target=worker)
    runner.start()

    # Show a placeholder first.
    yield "⏳ 正在生成中...", log_state

    # Poll the result queue every 0.1 s.
    while runner.is_alive() or not outbox.empty():
        try:
            answer = outbox.get(timeout=0.1)
        except Empty:
            continue
        yield answer, log_state
# ───────────────────── Gradio UI ─────────────────────
# Debug UI layout: sampling-parameter controls, a hidden-by-default log panel,
# the chat interface, and a timer that refreshes the log panel.
with gr.Blocks(title="调试界面") as demo:
gr.Markdown(f"## 💬 调试界面 \n权重 **{MODEL_PATH.name}**")
# Sampling-parameter controls
with gr.Row():
max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
with gr.Row():
top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
with gr.Row():
rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
stop_txt = gr.Textbox("", label="stop 序列(逗号分隔)")
log_state = gr.State("") # pass-through state
dbg_chk = gr.Checkbox(label="📜 显示 Debug 面板", value=False) # off by default
log_box = gr.Textbox(label="实时日志", lines=20, interactive=False, visible=False) # hidden by default
# Chat interface (original note: moved ahead of the log).
# additional_inputs are handed to chat() after (user, history), in this order;
# chat() yields log_state back as its additional output.
chatbot = gr.ChatInterface(
fn=chat,
additional_inputs=[max_new, temp, top_p, top_k,
rep_pen, pres_pen, stop_txt, log_state],
additional_outputs=[log_state],
type="messages"
)
# Log refresh timer: each tick calls consume_logs() and writes its result to log_box
timer = gr.Timer(1.0, render=True)
timer.tick(
fn=consume_logs,
inputs=[],
outputs=[log_box],
)
# Mirror log_state into log_box whenever the state value changes
log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
# The checkbox toggles visibility of the log panel
dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)
# Listen on all interfaces, port 30001
demo.launch(server_name="0.0.0.0", server_port=30001)

View File

@ -0,0 +1,10 @@
{
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -216,9 +216,13 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
@app.get("/health")
async def health():
    """Check the health of the http server and return version info.

    Returns:
        A dict with the fixed keys "status", "name" and "version"; it is
        returned as the response body of ``GET /health``.
    """
    return {
        "status": "ok",
        "name": "sglang_0.4.8.post1",
        "version": "v1.0.0",  # set this to the version string that should be reported
    }
@app.get("/health_generate")

View File

@ -868,12 +868,22 @@ def set_ulimit(target_soft_limit=65535):
def add_api_key_middleware(app, api_key: str):
    """Register an HTTP middleware on ``app`` that enforces Bearer-token auth.

    A request is forwarded without a token when it is a CORS preflight
    (``OPTIONS``) request or when its path starts with one of the whitelisted
    prefixes. Every other request must carry the header
    ``Authorization: Bearer <api_key>``; otherwise a 401 JSON response
    ``{"error": "Unauthorized"}`` is returned and the request is not forwarded.

    Args:
        app: application object exposing a ``middleware("http")`` decorator.
        api_key: the token that clients must present.
    """
    # Local import so this function does not depend on the file's import block.
    import hmac

    # Path prefixes that never require authentication.
    whitelist_prefixes = (
        "/health",
        "/metrics",
        "/ping",
        "/get_model_info",
    )
    expected = ("Bearer " + api_key).encode("utf-8")

    @app.middleware("http")
    async def authentication(request, call_next):
        # OPTIONS requests (CORS preflight) pass straight through.
        if request.method == "OPTIONS":
            return await call_next(request)

        # str.startswith accepts a tuple, so one call covers every prefix.
        if request.url.path.startswith(whitelist_prefixes):
            return await call_next(request)

        # Bearer-token check. compare_digest keeps the comparison time
        # independent of how many leading characters match; bytes are used
        # because compare_digest rejects non-ASCII str arguments.
        supplied = request.headers.get("Authorization") or ""
        if not hmac.compare_digest(supplied.encode("utf-8"), expected):
            return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)

23
supervisord.conf Normal file
View File

@ -0,0 +1,23 @@
; supervisord runs in the foreground and manages two programs: the sglang
; server (port 30000) and the UI script (port 30001).
; All logs go to the container's stdout/stderr; *_maxbytes=0 means no size
; limit, so no rotation is attempted on those streams.
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
; sglang server: base model from /root/.cradle/external/llm/ plus the LoRA
; adapter registered under the name "q3".
; NOTE(review): the API key is hard-coded in the command line below; consider
; supplying it from the environment instead - confirm with the deployment owner.
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
; UI script, started and restarted automatically like the server above.
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

View File

@ -0,0 +1,23 @@
; supervisord runs in the foreground and manages two programs: the sglang
; server (port 30000) and the UI script (port 30001).
; All logs go to the container's stdout/stderr; *_maxbytes=0 means no size
; limit, so no rotation is attempted on those streams.
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info
; sglang server: model loaded from /root/.cradle/Alibaba/Qwen3-30B-A3B/.
; NOTE(review): the API key is hard-coded in the command line below; consider
; supplying it from the environment instead - confirm with the deployment owner.
[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
; UI script, started and restarted automatically like the server above.
[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0