Compare commits
94 Commits
| SHA1 |
|---|
| 29de4e1411 |
| c516e234c0 |
| 106e5784e2 |
| 7669db4b55 |
| af007765a3 |
| 363c90da1b |
| 54fd416073 |
| 01ce15ddeb |
| aec50e2029 |
| 45c24387d9 |
| db9e41c3e0 |
| f32175aa48 |
| effd559734 |
| a2cc08abc6 |
| e71c4823ef |
| ebe7f87009 |
| 66b11eb836 |
| d2df3af90f |
| 47bb4e366e |
| 452a2ed902 |
| d33a596dfa |
| 985871bf02 |
| eb6f9ba605 |
| 342727753a |
| 0b2a49fe2c |
| 89053e46ef |
| 08e5939764 |
| d4823afc81 |
| 99a6957d04 |
| 7c375562cd |
| 26f8dc9ab5 |
| f86051512d |
| 0b24f7e814 |
| 9cb53f50f6 |
| 91194df5d8 |
| 0ce5191d31 |
| 095311d016 |
| f904c754e2 |
| 79abd2bbdd |
| 900be3e02d |
| 4bb857f22f |
| 44c3814d13 |
| 7bdc80cd1e |
| 8f12b8269a |
| 34c0c43673 |
| 6d8fbdc748 |
| 244d407937 |
| f8a7f93747 |
| c912bd2f74 |
| 6137a2e0d3 |
| 3e8115b036 |
| c8c95bd62f |
| 871d5994af |
| c2b7ec20b8 |
| 5d640d814b |
| 991f5c81a8 |
| 75c97d6423 |
| 4559c52759 |
| 8c2b8ca785 |
| 8282e562ae |
| 0b560f7067 |
| 82e5957f8e |
| d18985e8a3 |
| 4071f51150 |
| 818a722192 |
| 68a12b4b4a |
| ccf3398741 |
| b42b5f090b |
| 0333b8af9c |
| f932f0bd5f |
| d1a2b815b3 |
| 49b8cae1bb |
| b70297ece1 |
| f0e15aa1d8 |
| d2f69be68d |
| 6aa0932210 |
| 174a6b2d76 |
| 2cfc960bc3 |
| 222c46ef15 |
| 2e621b202d |
| b5036d09c3 |
| 39c32555d8 |
| 6ea2139b82 |
| 35ba2eab42 |
| f82e6c567f |
| 1a58b38c86 |
| d795691369 |
| e252241910 |
| a2a93c7c4c |
| c5e4ef4a6d |
| 8f6dc142af |
| 9ca3ebe4bb |
| 1d3223c4ae |
| 023d2a0868 |
Dockerfile (66 changes)
```diff
@@ -89,8 +89,9 @@ WORKDIR /sgl/sglang/python
 RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
     python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels
 
-# ── The sgl-kernel Python module ─────────────────────────────────────────────
-RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
+# ── 🔄 Download sgl-kernel (pinned in sync with sglang) ──────────────────────
+RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
 
 # ── Collect all wheels into /wheels ──────────────────────────────────────────
 RUN mkdir -p /wheels && \
@@ -99,6 +100,7 @@ RUN mkdir -p /wheels && \
     cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
     cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
     cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
+    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
     pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels
 
 # ── ✅ Also package the dependencies the runtime stage needs ──────────────────
@@ -108,6 +110,9 @@ RUN pip wheel \
     setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
     -w /wheels
 
+# ── ✅ Package the dependencies the gradio UI needs ───────────────────────────
+RUN pip wheel "gradio==5.38.2" requests -w /wheels
+
 ###############################################################################
 # Stage 2 ─ runtime: minimal runtime image, wheels installed offline only
 ###############################################################################
@@ -117,7 +122,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
 
 RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
-    python3 python3-dev python3-pip python3-distutils ca-certificates \
+    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
     libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
     rm -rf /var/lib/apt/lists/* && \
     python3 -m pip install --no-cache-dir --upgrade pip \
@@ -130,27 +135,35 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 # 👇 Recommended: refresh the linker cache afterwards
 RUN ldconfig
 
-COPY --from=builder-extras /wheels /tmp/wheels
-COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+# ---- Copy the pre-tuned MoE Triton kernel configs ---------------------------
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
 
+COPY --from=builder-extras /wheels /tmp/wheels
 
-#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
 # ✅ Install the self-built torch first so the PyPI build cannot override it
 RUN ls -lh /tmp/wheels && \
     rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
+    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
-    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
+    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
+    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
+    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
     rm -rf /tmp/wheels
 
-# # Install the dependencies the runtime turned out to be missing
-# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton
-
-# ✅ Install all dependencies offline (covers every runtime-required package)
-# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
-#     python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
-#     rm -rf /tmp/wheels
+# ✅ Install the Prometheus client
+RUN python3 -m pip install --no-cache-dir prometheus_client
+
+# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
+ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus
+
+# ✅ Make sure the directory exists
+RUN mkdir -p /tmp/prometheus
 
 # ✅ Add Tini (recommended)
 ENV TINI_VERSION=v0.19.0
@@ -159,15 +172,20 @@ RUN chmod +x /tini
 ENTRYPOINT ["/tini", "--"]
 
 # ---- Copy the model (path can be swapped) ----
-COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
+# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
 
+HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
 
 # ---- Expose ports ----
-EXPOSE 30000
+EXPOSE 30000 30001
 
-# ---- Launch the SGLang inference service ----
-CMD ["python3", "-m", "sglang.launch_server", \
-     "--host", "0.0.0.0", \
-     "--port", "30000", \
-     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
-     "--tp", "1", \
-     "--api-key", "token-abc123"]
+# Install supervisor
+RUN apt-get update && apt-get install -y supervisor && \
+    mkdir -p /etc/supervisor/conf.d
+
+# Copy the supervisord config and the UI script
+COPY ./meta_ui.py /app/meta_ui.py
+COPY ./supervisord.conf /etc/supervisor/supervisord.conf
+
+# Run supervisor as the container's main process
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
```
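The new CMD hands process management to supervisord, but the compare view does not include supervisord.conf itself. A minimal sketch of a config that would match the CMD and the two exposed ports, assuming meta_ui.py serves the gradio UI on 30001 (program names, the UI port, and the launch flags are illustrative, not taken from the repo):

```bash
# Hypothetical sketch only; the repo's actual supervisord.conf is not shown here.
cat > supervisord.conf <<'EOF'
[supervisord]
nodaemon=true                     ; keep supervisord in the foreground under tini

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --api-key token-abc123 --enable-metrics
autorestart=true

[program:meta_ui]
command=python3 /app/meta_ui.py   ; assumed to serve the gradio UI on 30001
autorestart=true
EOF
```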
@@ -0,0 +1,191 @@ (new file; full contents follow)

```dockerfile
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: use the self-built torch to build torchvision /
# flashinfer / sglang, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compiling, install directly) ──────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang sources and produce a wheel ───────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── 🔄 Download sgl-kernel (pinned in sync with sglang) ──────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies the gradio UI needs ───────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, wheels installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache afterwards
RUN ldconfig

# ---- Copy the pre-tuned MoE Triton kernel configs ---------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
```
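For reference, a build-and-run invocation consistent with the ARGs and ports declared above; the image tag and the MAX_JOBS value are arbitrary examples, not taken from the repo:

```bash
# Build: CUDA_VERSION and MAX_JOBS are the build args this Dockerfile declares.
docker build --build-arg CUDA_VERSION=12.6.1 --build-arg MAX_JOBS=32 -t sglang-qwen3-30b:cu126 .

# Run: 30000 is the SGLang API port and 30001 the UI port, both EXPOSEd above.
docker run --gpus all -p 30000:30000 -p 30001:30001 sglang-qwen3-30b:cu126
```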
@@ -0,0 +1,191 @@ (new file; identical to the 191-line Dockerfile above except that it packages the Qwen3-30B-A3B-Base checkpoint instead)

```dockerfile
# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base
```
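Both of these images set PROMETHEUS_MULTIPROC_DIR so that prometheus_client's MultiProcessCollector can aggregate metrics across the server's worker processes. A quick way to verify that wiring inside a built image (the image tag is an assumed example):

```bash
docker run --rm sglang-qwen3-30b:cu126 python3 -c "
import os
from prometheus_client import CollectorRegistry, multiprocess
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
print('multiproc dir:', os.environ['PROMETHEUS_MULTIPROC_DIR'])
"
```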
@@ -1,63 +0,0 @@ (file deleted; its former contents follow)

```dockerfile
############################################################
# Stage-0: build the dependency wheels (PyTorch + SGLang + sgl_kernel)
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder

# ---- Python environment ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six

# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1
RUN case "$CUDA_VERSION" in \
      12.6.1) CUINDEX=126 ;; \
      12.8.1) CUINDEX=128 ;; \
      *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac && \
    python -m pip install --no-cache-dir \
      torch==${TORCH_VER}+cu${CUINDEX} \
      torchvision==${TV_VER}+cu${CUINDEX} \
      --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
    python -m pip install --no-cache-dir \
      sglang==0.4.8.post1 \
      sgl-kernel==0.0.2.post17 \
      nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
    # ✅ Fill in the missing dependencies (required)
    python -m pip install --no-cache-dir \
      pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
# ✅ Test module integrity
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"

############################################################
# Stage-1: produce the minimal runtime image
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# ---- Python runtime ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

# ---- Copy the Python packages and entry points ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# ---- Copy the model (path can be swapped) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

# ---- Launch the service ----
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123"]
```
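The deleted file kept its module-integrity check commented out; an equivalent check can still be run against any of the new images (the tag is an assumed example):

```bash
docker run --rm --gpus all sglang-qwen3-30b:cu126 \
    python3 -c "import sglang, torch, sgl_kernel; print('torch', torch.__version__)"
```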
@@ -0,0 +1,177 @@ (new file) Stages 0 and 1 match the 191-line Dockerfile above through the sglang build step, with three differences: sgl-kernel is downloaded unpinned into /tmp/sgl_kernel_wheel, the wheel-collection step does not copy an sgl_kernel wheel into /wheels, and no gradio wheels are packaged.

```dockerfile
# ── The sgl-kernel Python module ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel
```

The runtime stage installs the same apt packages and cupti libraries and runs ldconfig, then diverges: there is no MoE config copy and no gradio/UI layer, the sgl-kernel wheel is installed from its own directory, a torch.distributed check replaces the gradio check, and the image launches sglang directly:

```dockerfile
COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path can be swapped) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
```
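Once this container reports healthy, the service can be exercised the same way the HEALTHCHECK does. The /v1 routes follow SGLang's OpenAI-compatible API, and --enable-metrics exposes Prometheus metrics on the server port; the host and port here assume the default run configuration:

```bash
curl -fs http://localhost:30000/health                                        # same probe as HEALTHCHECK
curl -s -H "Authorization: Bearer token-abc123" http://localhost:30000/v1/models
curl -s http://localhost:30000/metrics | head                                 # Prometheus metrics
```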
@@ -0,0 +1,191 @@ (new file; identical to the first 191-line Dockerfile above except that the model copy step is commented out)

```dockerfile
# ---- Copy the model (path can be swapped) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
```
@ -0,0 +1,177 @@
|
||||||
|
###############################################################################
|
||||||
|
# Stage 0 ─ builder-torch:编译 PyTorch 2.7.1 (+cu126)
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch
|
||||||
|
|
||||||
|
ENV USE_CUDA=1 \
|
||||||
|
USE_DISTRIBUTED=1 \
|
||||||
|
USE_MPI=1 \
|
||||||
|
USE_GLOO=1 \
|
||||||
|
USE_NCCL=1 \
|
||||||
|
USE_SYSTEM_NCCL=1 \
|
||||||
|
BUILD_TEST=0
|
||||||
|
|
||||||
|
ARG MAX_JOBS=90
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
|
||||||
|
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
|
||||||
|
libopenblas-dev libopenmpi-dev \
|
||||||
|
libnccl2=2.22.3-1+cuda12.6 \
|
||||||
|
libnccl-dev=2.22.3-1+cuda12.6 \
|
||||||
|
libjpeg-dev libpng-dev ca-certificates && \
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy
|
||||||
|
|
||||||
|
WORKDIR /opt
|
||||||
|
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
|
||||||
|
|
||||||
|
WORKDIR /opt/pytorch
|
||||||
|
ENV MAX_JOBS=${MAX_JOBS}
|
||||||
|
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
|
||||||
|
python3 setup.py bdist_wheel
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Stage 1 ─ builder-extras:用自编 Torch 装 TV / flashinfer / sglang,并收集轮子
|
||||||
|
###############################################################################
|
||||||
|
ARG CUDA_VERSION=12.6.1
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-pip python3-distutils python3.10-dev git build-essential \
|
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
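
# For reference, a metrics scraper can aggregate the per-process files with
# prometheus_client's documented multiprocess API (an illustrative snippet,
# not executed in this image):
#
#   from prometheus_client import CollectorRegistry, generate_latest, multiprocess
#   registry = CollectorRegistry()
#   multiprocess.MultiProcessCollector(registry)  # reads PROMETHEUS_MULTIPROC_DIR
#   print(generate_latest(registry).decode())     # Prometheus text format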

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
     "--tp", "2", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,183 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies needed by the gradio UI ───────────────────────
RUN pip wheel gradio requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,177 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@ -0,0 +1,191 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang on the self-built Torch, collect wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (against the local torch) ──────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ───────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip compilation, install directly) ─────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source and package it as a wheel ─────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── 🔄 Download sgl-kernel (kept in sync with sglang) ─────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies needed by the gradio UI ───────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the cupti shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Refresh the linker cache afterwards
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs -------------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot override it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multi-process metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisord as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@ -0,0 +1,224 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ────────────────── Basic configuration ──────────────────
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")


def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name
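
# Illustrative behavior (assumed values, not read from a real checkpoint):
# a Qwen3 config.json typically carries "architectures": ["Qwen3ForCausalLM"],
# so model_name() would return that string, while a directory without a
# config.json falls back to the folder name.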

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ────────────────── Log queue ──────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""


def log(msg):
    print(msg, flush=True)
    LOG_Q.put(msg)


prev_log_value = ""

def consume_logs(dummy=None):
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()


# ────────────────── Backend call ──────────────────
def backend(text, sampling, api_suffix):
    url = f"http://localhost:30000{api_suffix}"
    if api_suffix == "/generate":
        payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    elif api_suffix == "/v1/completions":
        payload = {
            "model": MODEL_NAME,
            "prompt": text,
            **sampling
        }
    elif api_suffix == "/v1/chat/completions":
        payload = {
            "model": MODEL_NAME,
            "messages": text,  # ← here `text` is actually the messages list
            **sampling
        }

    log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(url,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}

        if api_suffix == "/generate":
            txt = data.get("text", "").strip()
            meta = data.get("meta_info", {})
            fr = meta.get("finish_reason")
            ctok = meta.get("completion_tokens")
        elif api_suffix == "/v1/completions":
            choice = data.get("choices", [{}])[0]
            txt = choice.get("text", "").strip()
            fr = choice.get("finish_reason")
            ctok = data.get("usage", {}).get("completion_tokens")
        elif api_suffix == "/v1/chat/completions":
            choice = data.get("choices", [{}])[0]
            msg = choice.get("message", {})
            txt = msg.get("content", "").strip()

            # New: read completion_tokens from usage
            ctok = data.get("usage", {}).get("completion_tokens")
            fr = choice.get("finish_reason")  # in case the finish reason is needed later

        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp={r.text!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text}"
        return txt or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ Request failed] {e}")
        return f"[❌ Request failed] {e}"

# ────────────────── Chat callback ──────────────────
def chat(
    user_msg, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    api_suffix, log_state
):
    from queue import Queue, Empty

    user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg

    if api_suffix == "/v1/chat/completions":
        # Full history for the LLM (used for in-context reasoning)
        messages = history[:]
        messages.append({"role": "user", "content": user})
        prompt_input = messages
    else:
        prompt_input = user

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        ("max_tokens" if api_suffix == "/v1/completions" else "max_new_tokens"): int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    def worker():
        out = backend(prompt_input, samp, api_suffix)
        result_q.put(out)

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    if api_suffix == "/v1/chat/completions":
        while True:
            if not thread.is_alive() and result_q.empty():
                break
            try:
                result = result_q.get(timeout=0.1)
            except Empty:
                continue

            txt = result.strip() if isinstance(result, str) else str(result).strip()

            yield {"text": txt}, log_state
            return
    else:
        result = None  # guard against the worker finishing before the first poll
        while thread.is_alive():
            try:
                result = result_q.get(timeout=0.1)
                break
            except Empty:
                continue
        if result is None and not result_q.empty():
            result = result_q.get()

        if isinstance(result, str):
            result = {"text": result}
        elif not isinstance(result, dict) or "text" not in result:
            result = {"text": str(result)}

        yield result["text"], log_state
        return


# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI \nWeights **{MODEL_PATH.name}**")

    with gr.Row():
        api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
                                 value="/generate", label="Inference endpoint")

    with gr.Row():
        max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
        temp = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen = gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
        stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)

    chat_ui = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt,
                           api_choice, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    def clear_all_logs(_):
        global LOG_Q, LOG_TXT, prev_log_value
        with LOG_Q.mutex:
            LOG_Q.queue.clear()
        LOG_TXT = ""
        prev_log_value = ""
        return gr.update(value=""), gr.update(value="")

    api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
|
||||||
|
|
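For quick checks outside the UI, here is a minimal sketch of the same three calls with plain requests. It assumes, as the scripts above do, an sglang server on localhost:30000 with API key token-abc123; the model name is a placeholder:

import requests

BASE = "http://localhost:30000"
HDRS = {"Authorization": "Bearer token-abc123"}

# Native endpoint: sampling params are nested, the limit is max_new_tokens.
r = requests.post(f"{BASE}/generate", headers=HDRS, json={
    "text": "Hello", "sampling_params": {"max_new_tokens": 32, "temperature": 0.8}})
print(r.json().get("text"))

# OpenAI-compatible completions: flat payload, the limit is max_tokens.
r = requests.post(f"{BASE}/v1/completions", headers=HDRS, json={
    "model": "default", "prompt": "Hello", "max_tokens": 32})
print(r.json()["choices"][0]["text"])

# OpenAI-compatible chat completions: messages instead of a raw prompt.
r = requests.post(f"{BASE}/v1/chat/completions", headers=HDRS, json={
    "model": "default", "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 32})
print(r.json()["choices"][0]["message"]["content"])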
@ -0,0 +1,79 @@
import gradio as gr
import requests

API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"

# Build the prompt: a base model has no chat template, so the dialogue
# is spliced together as plain text
def build_prompt(history, user_message):
    prompt = ""
    for user, bot in history:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += f"User: {user_message}\nAssistant:"
    return prompt

# Main chat function
def chat(user_message, history, max_tokens, temperature):
    prompt = build_prompt(history, user_message)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": ["\nUser:", "\nAssistant:"]
    }

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
        reply = result["choices"][0]["text"].strip()
    except Exception as e:
        reply = f"[Request failed] {e}"

    return reply

# Manual API connectivity test
def test_api_connection(max_tokens, temperature):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": "Ping?",
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    try:
        resp = requests.post(API_URL, headers=headers, json=payload, timeout=10)
        out = resp.json()["choices"][0]["text"].strip()
        return f"✅ API reachable, response: {out}"
    except Exception as e:
        return f"❌ API request failed: {e}"

# Assemble the Gradio controls
with gr.Blocks(title="Base model test UI") as demo:
    gr.Markdown("# 💬 Base model chat UI")

    with gr.Row():
        max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
        test_btn = gr.Button("🔁 Test API availability")
        test_output = gr.Textbox(label="API test result", interactive=False)

    # Pass the sliders as additional_inputs so their *live* values reach chat();
    # reading max_tokens.value inside a lambda would freeze the initial values.
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_tokens, temperature],
        title=None
    )

    test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)

# Start the app
demo.launch(server_name="0.0.0.0", server_port=30001)
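To see why the stop list matters: without it, a base model happily continues the transcript and writes the next "User:" turn itself. A quick, self-contained sketch of what build_prompt produces for a one-turn history:

def build_prompt(history, user_message):
    prompt = ""
    for user, bot in history:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += f"User: {user_message}\nAssistant:"
    return prompt

print(build_prompt([("Hi", "Hello!")], "What is 2+2?"))
# User: Hi
# Assistant: Hello!
# User: What is 2+2?
# Assistant:
# The server is told to stop at "\nUser:" / "\nAssistant:", so generation
# ends as soon as the model tries to start a new turn.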
@ -0,0 +1,153 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ───────────────────── Basic config ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")

def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""  # global log cache, so a focused chat call cannot block log_box updates

def log(msg):  # write to the terminal + push onto the queue
    print(msg, flush=True)
    LOG_Q.put(msg)

prev_log_value = ""  # log content of the previous frame

def consume_logs(dummy=None):
    """Refresh log_box every second so chat cannot block UI updates."""
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()  # nothing new, skip the frontend refresh


# ───────────────────── Backend call ─────────────────────
def backend(text, sampling):
    payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(API_URL,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}
        fr = data.get("meta_info", {}).get("finish_reason")
        ctok = data.get("meta_info", {}).get("completion_tokens")
        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp800={r.text[:800]!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text[:300]}"
        return data.get("text", "").strip() or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ request error] {e}")
        return f"[❌ request error] {e}"

# ───────────────────── Chat callback ─────────────────────
def chat(
    user, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    log_state
):
    import threading
    from queue import Queue, Empty

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        "max_new_tokens": int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    # Run the backend inference on a background thread
    def worker():
        out = backend(user, samp)
        result_q.put(out)

    thread = threading.Thread(target=worker)
    thread.start()

    # Show a placeholder immediately
    yield "⏳ Generating...", log_state

    # Poll the result queue every 0.1 s (keeps the UI responsive)
    while thread.is_alive() or not result_q.empty():
        try:
            result = result_q.get(timeout=0.1)
            yield result, log_state
        except Empty:
            continue


# ───────────────────── Gradio UI ─────────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI  \nWeights: **{MODEL_PATH.name}**")

    # Sampling parameter controls
    with gr.Row():
        max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen  = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen = gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
        stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")  # state passthrough
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)  # off by default
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)  # hidden by default

    # Chat interface (placed before the log panel)
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    # Log refresh timer
    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
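The logging arrangement above is a small producer/consumer pattern: any thread may call log(), and only the timer callback touches the UI. A self-contained sketch of the same idea (names are illustrative, not from the script):

import queue, threading, time
from collections import deque

log_q: "queue.Queue[str]" = queue.Queue()
buffer = deque(maxlen=400)  # cap memory, like the 400-line window above

def log(msg: str) -> None:   # producer: safe to call from any thread
    log_q.put(msg)

def drain() -> str:          # consumer: called periodically by a single owner
    while not log_q.empty():
        buffer.append(log_q.get())
    return "\n".join(buffer)

threading.Thread(target=lambda: [log(f"line {i}") for i in range(5)]).start()
time.sleep(0.1)
print(drain())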
@ -0,0 +1,10 @@
{
    "64": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 3
    }
}
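This JSON has the shape of a Triton fused-kernel tuning file: the outer key is a token-count bucket and the inner dict holds launch parameters for that bucket. A hedged sketch of how such a file is typically consumed; the nearest-bucket rule is an assumption about the loader, not taken from this repo:

import json

def pick_config(configs: dict, num_tokens: int) -> dict:
    # Choose the tuning entry whose bucket is closest to the actual size.
    best = min(configs, key=lambda k: abs(int(k) - num_tokens))
    return configs[best]

configs = json.loads("""{"64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32,
  "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}}""")
print(pick_config(configs, 48))  # -> the "64" entry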
@ -216,9 +216,13 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 
 
 @app.get("/health")
-async def health() -> Response:
-    """Check the health of the http server."""
-    return Response(status_code=200)
+async def health():
+    """Check the health of the http server and return version info."""
+    return {
+        "status": "ok",
+        "name": "sglang_0.4.8.post1",
+        "version": "v1.0.0"  # put the version string you want to expose here
+    }
 
 
 @app.get("/health_generate")
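With this change, /health goes from an empty 200 to a JSON body, so a probe can read the deployment's version. A quick check (no Authorization header needed, since /health is whitelisted in the middleware hunk below):

import requests

r = requests.get("http://localhost:30000/health", timeout=5)
assert r.status_code == 200
print(r.json())  # e.g. {"status": "ok", "name": "sglang_0.4.8.post1", "version": "v1.0.0"}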
@ -868,12 +868,22 @@ def set_ulimit(target_soft_limit=65535):
 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
+        # Let OPTIONS requests (CORS preflight) through without auth
         if request.method == "OPTIONS":
             return await call_next(request)
-        if request.url.path.startswith("/health"):
-            return await call_next(request)
-        if request.url.path.startswith("/metrics"):
-            return await call_next(request)
+
+        # Explicitly list the path prefixes that skip authentication
+        whitelist_prefixes = (
+            "/health",
+            "/metrics",
+            "/ping",
+            "/get_model_info",
+        )
+
+        if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
+            return await call_next(request)
+
+        # Bearer token authentication
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
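A small smoke test for the new whitelist behaviour: unauthenticated requests should pass on the listed prefixes and be rejected everywhere else. Paths follow the hunk above; the base URL and key are this repo's defaults:

import requests

BASE = "http://localhost:30000"

# Whitelisted prefixes: reachable without a token.
for path in ("/health", "/metrics", "/ping", "/get_model_info"):
    print(path, requests.get(BASE + path, timeout=5).status_code)  # expect != 401

# Protected endpoint: 401 without the bearer token, 200 with it.
payload = {"text": "Hi", "sampling_params": {"max_new_tokens": 8}}
print(requests.post(f"{BASE}/generate", json=payload, timeout=30).status_code)  # 401
print(requests.post(f"{BASE}/generate", json=payload, timeout=30,
                    headers={"Authorization": "Bearer token-abc123"}).status_code)  # 200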
@ -0,0 +1,23 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
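This variant registers a LoRA adapter under the name q3 via --lora-paths. A hedged sketch of selecting that adapter per request: recent sglang builds accept a lora_path field in the /generate payload, but treat the exact field name as an assumption to verify against the deployed version:

import requests

payload = {
    "text": "Hello",
    "sampling_params": {"max_new_tokens": 32},
    "lora_path": "q3",  # adapter name registered via --lora-paths q3=...
}
r = requests.post("http://localhost:30000/generate",
                  headers={"Authorization": "Bearer token-abc123"},
                  json=payload, timeout=60)
print(r.json().get("text"))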
@ -0,0 +1,23 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
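supervisord starts both programs at once, but the UI on port 30001 is only useful once the model server on port 30000 is up. A gate like the following at the top of meta_ui.py would hold the UI back until /health answers (a sketch, not part of the repo):

import time, requests

def wait_for_server(url: str = "http://localhost:30000/health",
                    timeout_s: int = 600) -> None:
    """Block until the sglang /health endpoint responds, or raise."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return
        except requests.RequestException:
            pass  # server not up yet; keep polling
        time.sleep(2)
    raise RuntimeError(f"server at {url} not healthy after {timeout_s}s")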