Compare commits
No commits in common. "main" and "v1.0.0" have entirely different histories.
Dockerfile
@@ -89,9 +89,8 @@ WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels


# ── 🔄 Download sgl-kernel (kept in sync with sglang) ────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels
# ── Python module for sgl-kernel ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
@@ -100,7 +99,6 @@ RUN mkdir -p /wheels && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
@@ -110,9 +108,6 @@ RUN pip wheel \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package dependencies for the gradio UI ─────────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
@@ -122,7 +117,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    python3 python3-dev python3-pip python3-distutils ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
@@ -135,35 +130,27 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
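# (Gloss, not from the repository: the configs directory holds JSON files of
#  pre-tuned Triton launch parameters for the fused-MoE kernel, keyed by values
#  such as expert count, intermediate size, and GPU device name. When no
#  matching file exists, SGLang falls back to heuristic defaults, so a missing
#  config costs performance rather than correctness.)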

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# # Install dependencies that were missed at runtime
# RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton


# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus
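# (Sketch for context, assuming prometheus_client's standard multiprocess API;
#  this is not repository code. A scrape endpoint merges the per-process files
#  that accumulate under PROMETHEUS_MULTIPROC_DIR like so:
#      from prometheus_client import CollectorRegistry, generate_latest, multiprocess
#      registry = CollectorRegistry()
#      multiprocess.MultiProcessCollector(registry)   # reads /tmp/prometheus/*.db
#      payload = generate_latest(registry)            # Prometheus text format
#  SGLang performs the equivalent internally when started with --enable-metrics.)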
# ✅ Offline-install all dependencies (every runtime-required package included)
# RUN python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
#     python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
#     rm -rf /tmp/wheels

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
|
@ -172,20 +159,15 @@ RUN chmod +x /tini
|
|||
ENTRYPOINT ["/tini", "--"]
|
||||
|
||||
# ---- 拷贝模型(路径可换) ----
|
||||
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1
|
||||
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B
|
||||
|
||||
# ---- 暴露端口 ----
|
||||
EXPOSE 30000 30001
|
||||
EXPOSE 30000
|
||||
|
||||
# 安装 supervisor
|
||||
RUN apt-get update && apt-get install -y supervisor && \
|
||||
mkdir -p /etc/supervisor/conf.d
|
||||
|
||||
# 拷贝 supervisord 配置文件和 UI 脚本
|
||||
COPY ./meta_ui.py /app/meta_ui.py
|
||||
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
|
||||
|
||||
# 作为容器主进程运行 supervisor
|
||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
|
||||
# ---- 启动 SGLang 推理服务 ----
|
||||
CMD ["python3", "-m", "sglang.launch_server", \
|
||||
"--host", "0.0.0.0", \
|
||||
"--port", "30000", \
|
||||
"--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
|
||||
"--tp", "1", \
|
||||
"--api-key", "token-abc123"]
|
||||
|
|
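Note that when a Dockerfile contains two CMD instructions, only the last one takes effect, so in the hunk above the sglang.launch_server CMD supersedes the supervisord CMD. For reference, a minimal supervisord.conf sketch that would match the COPY lines above; the program stanzas and autorestart settings are assumptions for illustration, not taken from the repository's supervisord.conf:

# [supervisord]
# nodaemon=true
#
# [program:sglang]
# command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-8B/ --tp 1 --api-key token-abc123
# autorestart=true
#
# [program:meta_ui]
# command=python3 /app/meta_ui.py
# autorestart=true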
@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

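# (Hypothetical check, not in the original file: before the wheel is collected,
#  one could confirm it reports the expected version and CUDA toolkit —
#      RUN python3 -m pip install --no-cache-dir dist/torch-*.whl && \
#          python3 -c "import torch; print(torch.__version__, torch.version.cuda)"
#  torch.version.cuda should print 12.6 to match the cu126 devel base image.)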
###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels


# ── 🔄 Download sgl-kernel (kept in sync with sglang) ────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package dependencies for the gradio UI ─────────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs


COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels



# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
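# (Context note: running Tini as PID 1 forwards signals to the child process
#  tree and reaps zombie processes, which supervisord and the server below rely
#  on for clean shutdown.)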

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels


# ── 🔄 Download sgl-kernel (kept in sync with sglang) ────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package dependencies for the gradio UI ─────────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs


COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels



# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-30B-A3B-Base /root/.cradle/Alibaba/Qwen3-30B-A3B-Base

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@@ -0,0 +1,63 @@
############################################################
# Stage-0: build dependency wheels (PyTorch + SGLang + sgl_kernel)  #
############################################################
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS builder
# Re-declare the global ARG so that $CUDA_VERSION is visible inside this stage
# (an ARG declared before the first FROM is not available in RUN instructions)
ARG CUDA_VERSION

# ---- Python environment ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    python -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six

# ---- PyTorch / torchvision / SGLang / sgl_kernel ----
ARG TORCH_VER=2.7.1
ARG TV_VER=0.22.1
RUN case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac && \
    python -m pip install --no-cache-dir \
        torch==${TORCH_VER}+cu${CUINDEX} \
        torchvision==${TV_VER}+cu${CUINDEX} \
        --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} && \
    python -m pip install --no-cache-dir \
        sglang==0.4.8.post1 \
        sgl-kernel==0.0.2.post17 \
        nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps && \
    # ✅ Fill in the missing dependencies (required)
    python -m pip install --no-cache-dir \
        pydantic psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle orjson uvloop sentencepiece
# ✅ Test module integrity
#python -c "import sglang, torch, pydantic, transformers, sgl_kernel"

############################################################
# Stage-1: produce the minimal runtime image               #
############################################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# ---- Python runtime ----
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-distutils && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*

# ---- Copy the Python packages and entry points ----
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

# ---- Start the service ----
EXPOSE 30000
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123"]
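Once the container is up, the server can be smoke-tested through its OpenAI-compatible endpoint. A sketch, assuming the default /v1 routes and that the model name mirrors --model-path; the bearer token is the --api-key value above:

# curl http://localhost:30000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -H "Authorization: Bearer token-abc123" \
#   -d '{"model": "/root/.cradle/Alibaba/Qwen3-8B/", "messages": [{"role": "user", "content": "Hello"}]}'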
@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Deepseek/DeepSeek-R1-Distill-Llama-70B /root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Deepseek/DeepSeek-R1-Distill-Llama-70B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
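With --enable-metrics set as above, the server exposes a Prometheus scrape endpoint; a quick check (a sketch, assuming the metrics route is served on the same port at /metrics):

# curl -s http://localhost:30000/metrics | head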
@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels


# ── 🔄 Download sgl-kernel (kept in sync with sglang) ────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package dependencies for the gradio UI ─────────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

# ---- Copy pre-tuned MoE Triton kernel configs ----------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs


COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(ls /tmp/wheels | grep -v '^gradio-' | sed 's|^|/tmp/wheels/|') && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels



# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B /root/.cradle/Alibaba/Qwen3-14B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference service ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-14B/", \
     "--tp", "2", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
@@ -1,183 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: build torchvision / flashinfer / sglang against the
# self-built Torch, and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ───────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir


# ── Build torchvision 0.22.1 (depends on the local torch) ────────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (the main branch supports torch 2.7 / cu126) ────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ───────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels


# ── Build your local sglang sources and produce a wheel ──────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ─────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package dependencies required by the runtime stage ────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package dependencies for the gradio UI ─────────────────────────────────
RUN pip wheel gradio requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image; wheels are installed offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended: refresh the linker cache right after the library copies
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI build cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is interchangeable) ----
COPY ./Alibaba/Qwen3-14B-Base /root/.cradle/Alibaba/Qwen3-14B-Base

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001
|
||||
|
||||
# 安装 supervisor
|
||||
RUN apt-get update && apt-get install -y supervisor && \
|
||||
mkdir -p /etc/supervisor/conf.d
|
||||
|
||||
# 拷贝 supervisord 配置文件和 UI 脚本
|
||||
COPY ./meta_ui.py /app/meta_ui.py
|
||||
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
|
||||
|
||||
# 作为容器主进程运行 supervisor
|
||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
|
||||
|
|
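The runtime stage above only sets PROMETHEUS_MULTIPROC_DIR and creates the directory; merging metrics across the server's worker processes happens in whichever process answers /metrics. A minimal sketch of that consumer side, assuming nothing beyond the prometheus_client package installed above (the function name is illustrative):

```python
# Minimal sketch: merge metrics from all worker processes.
# Assumes PROMETHEUS_MULTIPROC_DIR points at /tmp/prometheus, as in the image.
from prometheus_client import CollectorRegistry, generate_latest, multiprocess

def render_metrics() -> bytes:
    registry = CollectorRegistry()
    # MultiProcessCollector reads the per-process .db files written under
    # PROMETHEUS_MULTIPROC_DIR and merges them into one registry.
    multiprocess.MultiProcessCollector(registry)
    return generate_latest(registry)
```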
@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang on the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ───────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source tree into a wheel ─────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended follow-up: refresh the linker cache
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI version cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-32B /root/.cradle/Alibaba/Qwen3-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
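Since this image serves an OpenAI-compatible API on port 30000 with the API key baked into the CMD, any stock OpenAI client can talk to it. A quick sketch; the served model name is an assumption, so check /v1/models first:

```python
# Sketch: query the container's OpenAI-compatible endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="token-abc123")
resp = client.completions.create(
    model="Qwen3-32B",  # assumed served-model name; list /v1/models to confirm
    prompt="Hello",
    max_tokens=32,
)
print(resp.choices[0].text)
```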
@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang on the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ───────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source tree into a wheel ─────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended follow-up: refresh the linker cache
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI version cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-8B /root/.cradle/Alibaba/Qwen3-8B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/Qwen3-8B/", \
     "--tp", "1", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
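The same server also exposes SGLang's native /generate route, which the debug UIs later in this compare use. A minimal request sketch (sampling values are illustrative):

```python
# Sketch: hit SGLang's native /generate endpoint directly.
import requests

r = requests.post(
    "http://localhost:30000/generate",
    headers={"Authorization": "Bearer token-abc123"},
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 32, "temperature": 0.8}},
    timeout=60,
)
r.raise_for_status()
print(r.json()["text"])  # /generate responds with {"text": ..., "meta_info": {...}}
```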
@@ -1,177 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang on the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ───────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source tree into a wheel ─────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── Python module for sgl-kernel ────────────────────────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel -d /tmp/sgl_kernel_wheel

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended follow-up: refresh the linker cache
RUN ldconfig

COPY --from=builder-extras /wheels /tmp/wheels
COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel

#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
# ✅ Install the self-built torch first so the PyPI version cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/QwQ-32B /root/.cradle/Alibaba/QwQ-32B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000

# ---- Launch the SGLang inference server ----
CMD ["python3", "-m", "sglang.launch_server", \
     "--host", "0.0.0.0", \
     "--port", "30000", \
     "--model-path", "/root/.cradle/Alibaba/QwQ-32B/", \
     "--tp", "4", \
     "--api-key", "token-abc123", \
     "--enable-metrics"]
191
Dockerfile.tmp

@@ -1,191 +0,0 @@
###############################################################################
# Stage 0 ─ builder-torch: build PyTorch 2.7.1 (+cu126)
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-torch

ENV USE_CUDA=1 \
    USE_DISTRIBUTED=1 \
    USE_MPI=1 \
    USE_GLOO=1 \
    USE_NCCL=1 \
    USE_SYSTEM_NCCL=1 \
    BUILD_TEST=0

ARG MAX_JOBS=90
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
    libopenblas-dev libopenmpi-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 \
    libjpeg-dev libpng-dev ca-certificates && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools sympy pyyaml typing-extensions numpy

WORKDIR /opt
RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git

WORKDIR /opt/pytorch
ENV MAX_JOBS=${MAX_JOBS}
RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
    python3 setup.py bdist_wheel

###############################################################################
# Stage 1 ─ builder-extras: install torchvision / flashinfer / sglang on the self-built Torch and collect the wheels
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS builder-extras

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip python3-distutils python3.10-dev git build-essential \
    cmake ninja-build libjpeg-dev libpng-dev ca-certificates \
    libopenmpi-dev libopenblas-dev \
    libnccl2=2.22.3-1+cuda12.6 \
    libnccl-dev=2.22.3-1+cuda12.6 && \
    python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools

# ── Install the self-built torch wheel ──────────────────────────────────────
COPY --from=builder-torch /opt/pytorch/dist /tmp/torch_dist
RUN set -e && \
    echo "==> Files in /tmp/torch_dist:" && ls -lh /tmp/torch_dist && \
    find /tmp/torch_dist -name 'torch-*.whl' -print | xargs -r python3 -m pip install --no-cache-dir

# ── Build torchvision 0.22.1 (depends on the local torch) ───────────────────
WORKDIR /opt
RUN git clone -b v0.22.1 https://github.com/pytorch/vision.git
WORKDIR /opt/vision
RUN python3 setup.py bdist_wheel

# ── Build flashinfer (main branch supports torch 2.7 / cu126) ───────────────
WORKDIR /opt
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
WORKDIR /opt/flashinfer

RUN pip install . && \
    python3 -m pip wheel . --no-deps -w dist/

# # ── Install vllm (skip the build, install directly) ────────────────────────
# WORKDIR /opt
# RUN pip install setuptools wheel setuptools_scm && \
#     pip install git+https://github.com/vllm-project/vllm.git@main --no-deps && \
#     python3 -m pip wheel vllm -w /tmp/vllm_wheels --no-deps

# ── Download the prebuilt vllm wheel to avoid compiling flash-attn ──────────
WORKDIR /opt
RUN pip download --only-binary=:all: --no-deps vllm==0.9.1 -d /tmp/vllm_wheels

# ── Build the local sglang source tree into a wheel ─────────────────────────
COPY ./sglang /sgl/sglang
WORKDIR /sgl/sglang/python
RUN python3 -m pip install ".[srt,openai]" --no-build-isolation && \
    python3 -m pip wheel ".[srt,openai]" --no-deps -w /tmp/sg_wheels

# ── 🔄 Download sgl-kernel (kept in sync with sglang) ─────────────────────────
RUN pip download --only-binary=:all: --no-deps sgl-kernel==0.1.9 -d /tmp/sgl_kernel_wheels

# ── Collect all wheels into /wheels ──────────────────────────────────────────
RUN mkdir -p /wheels && \
    cp /tmp/torch_dist/torch*.whl /wheels/ && \
    cp /opt/vision/dist/torchvision-*.whl /wheels/ && \
    cp /opt/flashinfer/dist/flashinfer_python-*.whl /wheels/ && \
    cp /tmp/vllm_wheels/vllm-*.whl /wheels/ && \
    cp /tmp/sg_wheels/sglang-*.whl /wheels/ && \
    cp /tmp/sgl_kernel_wheels/sgl_kernel-*.whl /wheels/ && \
    pip wheel filelock typing-extensions sympy fsspec jinja2 networkx -w /wheels

# ── ✅ Also package the dependencies the runtime stage needs ──────────────────
RUN pip wheel \
    pydantic orjson psutil pyzmq pynvml \
    transformers==4.52.0 uvicorn fastapi IPython aiohttp \
    setproctitle uvloop sentencepiece triton pillow cachetools msgspec blake3 cloudpickle compressed-tensors einops openai py-cpuinfo dill partial_json_parser python-multipart torchao \
    -w /wheels

# ── ✅ Package the dependencies for the gradio UI ─────────────────────────────
RUN pip wheel "gradio==5.38.2" requests -w /wheels

###############################################################################
# Stage 2 ─ runtime: minimal runtime image, installs wheels offline only
###############################################################################
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ build-essential ninja-build cuda-compiler-12-6 \
    python3 python3-dev python3-pip python3-distutils curl ca-certificates \
    libopenblas-dev libgomp1 libcupti-dev libnuma1 libopenmpi-dev openmpi-bin libnuma-dev libpng16-16 libjpeg8 && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir xgrammar

# 👉 Copy the CUPTI shared libraries (avoids hard-coding a version number)
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so.12 /usr/lib/x86_64-linux-gnu/
COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-linux-gnu/

# 👇 Recommended follow-up: refresh the linker cache
RUN ldconfig

# ---- Copy the pre-tuned MoE Triton kernel configs ---------------------------
COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs

COPY --from=builder-extras /wheels /tmp/wheels

# ✅ Install the self-built torch first so the PyPI version cannot overwrite it
RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/gradio-5.38.2*.whl && \
    python3 -m pip install --no-cache-dir --no-deps $(find /tmp/wheels -maxdepth 1 -type f -name '*.whl' ! -name 'gradio-*') && \
    python3 -c "import gradio, sys; print('✅ Gradio version =', gradio.__version__)" && \
    rm -rf /tmp/wheels

# ✅ Install the Prometheus client
RUN python3 -m pip install --no-cache-dir prometheus_client

# ✅ Set the multiprocess metrics directory (used by MultiProcessCollector)
ENV PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus

# ✅ Make sure the directory exists
RUN mkdir -p /tmp/prometheus

# ✅ Add Tini (recommended)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

# ---- Copy the model (path is swappable) ----
COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B

HEALTHCHECK --interval=30s --timeout=2s --start-period=300s --retries=5 CMD curl -fs http://localhost:30000/health || exit 1

# ---- Expose ports ----
EXPOSE 30000 30001

# Install supervisor
RUN apt-get update && apt-get install -y supervisor && \
    mkdir -p /etc/supervisor/conf.d

# Copy the supervisord config and the UI script
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf

# Run supervisor as the container's main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
224
meta_ui.py

@@ -1,224 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ────────────────── Basic config ──────────────────
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")


def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ────────────────── Log queue ──────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""


def log(msg):
    print(msg, flush=True)
    LOG_Q.put(msg)


prev_log_value = ""

def consume_logs(dummy=None):
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()


# ────────────────── Backend call ──────────────────
def backend(text, sampling, api_suffix):
    url = f"http://localhost:30000{api_suffix}"
    if api_suffix == "/generate":
        payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    elif api_suffix == "/v1/completions":
        payload = {
            "model": MODEL_NAME,
            "prompt": text,
            **sampling
        }
    elif api_suffix == "/v1/chat/completions":
        payload = {
            "model": MODEL_NAME,
            "messages": text,  # ← here `text` is actually the messages list
            **sampling
        }

    log(f"\n🟡 [{now()}] POST {url}\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(url,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}

        if api_suffix == "/generate":
            txt = data.get("text", "").strip()
            meta = data.get("meta_info", {})
            fr = meta.get("finish_reason")
            ctok = meta.get("completion_tokens")
        elif api_suffix == "/v1/completions":
            choice = data.get("choices", [{}])[0]
            txt = choice.get("text", "").strip()
            fr = choice.get("finish_reason")
            ctok = data.get("usage", {}).get("completion_tokens")
        elif api_suffix == "/v1/chat/completions":
            choice = data.get("choices", [{}])[0]
            msg = choice.get("message", {})
            txt = msg.get("content", "").strip()

            # New: read completion_tokens from usage
            ctok = data.get("usage", {}).get("completion_tokens")
            fr = choice.get("finish_reason")  # in case the finish reason is needed later

        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp={r.text!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text}"
        return txt or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ request error] {e}")
        return f"[❌ request error] {e}"


# ────────────────── Chat callback ──────────────────
def chat(
    user_msg, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    api_suffix, log_state
):
    from queue import Queue, Empty

    user = user_msg["text"] if isinstance(user_msg, dict) and "text" in user_msg else user_msg

    if api_suffix == "/v1/chat/completions":
        # Full history for the LLM (context for the next turn)
        messages = history[:]
        messages.append({"role": "user", "content": user})
        prompt_input = messages
    else:
        prompt_input = user

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        ("max_tokens" if api_suffix == "/v1/completions" else "max_new_tokens"): int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    def worker():
        out = backend(prompt_input, samp, api_suffix)
        result_q.put(out)

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    if api_suffix == "/v1/chat/completions":
        while True:
            if not thread.is_alive() and result_q.empty():
                break
            try:
                result = result_q.get(timeout=0.1)
            except Empty:
                continue

            txt = result.strip() if isinstance(result, str) else str(result).strip()

            yield {"text": txt}, log_state
        return
    else:
        # Poll until the worker has delivered its single result; checking the
        # queue as well avoids missing a result that lands just as the thread exits.
        while thread.is_alive() or not result_q.empty():
            try:
                result = result_q.get(timeout=0.1)
                break
            except Empty:
                continue

        if isinstance(result, str):
            result = {"text": result}
        elif not isinstance(result, dict) or "text" not in result:
            result = {"text": str(result)}

        yield result["text"], log_state
        return


# ────────────────── Gradio UI ──────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI  \nWeights **{MODEL_PATH.name}**")

    with gr.Row():
        api_choice = gr.Dropdown(choices=["/generate", "/v1/completions", "/v1/chat/completions"],
                                 value="/generate", label="Inference endpoint")

    with gr.Row():
        max_new = gr.Slider(32, 32768, 1024, label="max_new_tokens")
        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
    stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)

    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt,
                           api_choice, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    def clear_all_logs(_):
        global LOG_Q, LOG_TXT, prev_log_value
        with LOG_Q.mutex:
            LOG_Q.queue.clear()
        LOG_TXT = ""
        prev_log_value = ""
        return gr.update(value=""), gr.update(value="")

    api_choice.change(fn=clear_all_logs, inputs=api_choice, outputs=[log_state, log_box])
    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
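For reference, the three payload shapes chat() and backend() above produce for a single user turn. Note that only /v1/completions gets a max_tokens key; the other two endpoints keep max_new_tokens. Model name and sampling values below are illustrative:

```python
# Worked example of the payloads for one user turn ("Hi").
samp = {"temperature": 0.8, "top_p": 0.95, "top_k": 50,
        "repetition_penalty": 1.05, "presence_penalty": 0.0}

generate_payload = {"model": "Qwen3Model", "text": "Hi",
                    "sampling_params": {"max_new_tokens": 64, **samp}}
completions_payload = {"model": "Qwen3Model", "prompt": "Hi",
                       "max_tokens": 64, **samp}
chat_payload = {"model": "Qwen3Model",
                "messages": [{"role": "user", "content": "Hi"}],
                "max_new_tokens": 64, **samp}
```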
@@ -1,79 +0,0 @@
import gradio as gr
import requests

API_URL = "http://localhost:30000/v1/completions"
API_KEY = "token-abc123"
MODEL_NAME = "Qwen3-14b-base"

# Build the prompt: a base model relies on plain concatenated context
def build_prompt(history, user_message):
    prompt = ""
    for user, bot in history:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += f"User: {user_message}\nAssistant:"
    return prompt

# Main chat function
def chat(user_message, history, max_tokens, temperature):
    prompt = build_prompt(history, user_message)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": ["\nUser:", "\nAssistant:"]
    }

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
        reply = result["choices"][0]["text"].strip()
    except Exception as e:
        reply = f"[request failed] {e}"

    return reply

# Manually test that the API is reachable
def test_api_connection(max_tokens, temperature):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "prompt": "Ping?",
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    try:
        resp = requests.post(API_URL, headers=headers, json=payload, timeout=10)
        out = resp.json()["choices"][0]["text"].strip()
        return f"✅ API reachable, response: {out}"
    except Exception as e:
        return f"❌ API request failed: {e}"

# Assemble the Gradio widgets
with gr.Blocks(title="Base model test UI") as demo:
    gr.Markdown("# 💬 Base model chat UI")

    with gr.Row():
        max_tokens = gr.Slider(32, 1024, value=256, label="max_tokens")
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
    test_btn = gr.Button("🔁 Test API availability")
    test_output = gr.Textbox(label="API test result", interactive=False)

    # Pass the sliders as additional inputs so their current values reach
    # chat(); reading .value in a lambda only ever sees the initial defaults.
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_tokens, temperature],
        title=None
    )

    test_btn.click(fn=test_api_connection, inputs=[max_tokens, temperature], outputs=test_output)

# Start the server
demo.launch(server_name="0.0.0.0", server_port=30001)
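A worked example of the prompt build_prompt() in the listing above stitches together; the stop sequences in the payload exist precisely to cut generation at the next User:/Assistant: marker:

```python
# Assumes build_prompt() from the file above is in scope.
history = [("Hi", "Hello! How can I help?")]
print(build_prompt(history, "What is 2+2?"))
# User: Hi
# Assistant: Hello! How can I help?
# User: What is 2+2?
# Assistant:
```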
153
meta_ui_old.py

@@ -1,153 +0,0 @@
import json, datetime, textwrap, requests, gradio as gr
from pathlib import Path
from collections import deque
import queue, threading, time

# ───────────────────── Basic config ─────────────────────
API_URL = "http://localhost:30000/generate"
API_KEY = "token-abc123"
MODEL_PATH = Path("/root/.cradle/Alibaba/Qwen3-30B-A3B-Base")

def model_name(path: Path):
    cfg = path / "config.json"
    if cfg.exists():
        data = json.load(cfg.open())
        return data.get("architectures", [None])[0] or data.get("model_type") or path.name
    return path.name

MODEL_NAME = model_name(MODEL_PATH)
now = lambda: datetime.datetime.now().strftime("%H:%M:%S")

# ───────────────────── Log queue ─────────────────────
LOG_Q: "queue.Queue[str]" = queue.Queue()
LOG_TXT = ""  # ✅ global log cache so a focused chat cannot block log_box updates

def log(msg):  # write to the terminal and push onto the queue
    print(msg, flush=True)
    LOG_Q.put(msg)

prev_log_value = ""  # log contents from the previous frame

def consume_logs(dummy=None):
    """Refresh log_box once per second so chat cannot stall UI updates."""
    global LOG_TXT, prev_log_value
    buf = deque(LOG_TXT.splitlines(), maxlen=400)
    while not LOG_Q.empty():
        buf.append(LOG_Q.get())
    LOG_TXT = "\n".join(buf)
    if LOG_TXT != prev_log_value:
        prev_log_value = LOG_TXT
        return gr.update(value=LOG_TXT)
    return gr.update()  # nothing new → don't trigger a frontend refresh


# ───────────────────── Backend call ─────────────────────
def backend(text, sampling):
    payload = {"model": MODEL_NAME, "text": text, "sampling_params": sampling}
    log(f"\n🟡 [{now()}] payload\n{json.dumps(payload, ensure_ascii=False, indent=2)}")
    try:
        r = requests.post(API_URL,
                          headers={"Authorization": f"Bearer {API_KEY}",
                                   "Content-Type": "application/json"},
                          json=payload, timeout=180)
        try:
            data = r.json()
        except Exception:
            data = {}
        fr = data.get("meta_info", {}).get("finish_reason")
        ctok = data.get("meta_info", {}).get("completion_tokens")
        log(f"🟢 [{now()}] HTTP {r.status_code} tokens={ctok} finish={fr}\n"
            f"🟢 resp800={r.text[:800]!r}")
        if r.status_code != 200:
            return f"[HTTP {r.status_code}] {r.text[:300]}"
        return data.get("text", "").strip() or "[⚠ empty]"
    except Exception as e:
        log(f"[❌ request error] {e}")
        return f"[❌ request error] {e}"

# ───────────────────── Chat callback ─────────────────────
def chat(
    user, history,
    max_new, temp, top_p, top_k,
    rep_pen, pres_pen, stop_raw,
    log_state
):
    import threading
    from queue import Queue, Empty

    stop = [s.strip() for s in stop_raw.split(",") if s.strip()] or None
    samp = {
        "max_new_tokens": int(max_new),
        "temperature": temp,
        "top_p": top_p,
        "top_k": int(top_k),
        "repetition_penalty": rep_pen,
        "presence_penalty": pres_pen,
        **({"stop": stop} if stop else {})
    }

    result_q = Queue()

    # Run the backend inference in a background thread
    def worker():
        out = backend(user, samp)
        result_q.put(out)

    thread = threading.Thread(target=worker)
    thread.start()

    # Yield a placeholder first
    yield "⏳ Generating...", log_state

    # Poll the result queue every 0.1 s (avoids blocking the UI)
    while thread.is_alive() or not result_q.empty():
        try:
            result = result_q.get(timeout=0.1)
            yield result, log_state
        except Empty:
            continue


# ───────────────────── Gradio UI ─────────────────────
with gr.Blocks(title="Debug UI") as demo:
    gr.Markdown(f"## 💬 Debug UI  \nWeights **{MODEL_PATH.name}**")

    # Sampling parameter widgets
    with gr.Row():
        max_new = gr.Slider(32, 32768, 128, label="max_new_tokens")
        temp    = gr.Slider(0, 1.5, 0.8, step=0.05, label="temperature")
    with gr.Row():
        top_p = gr.Slider(0, 1, 0.95, step=0.01, label="top_p")
        top_k = gr.Slider(0, 200, 50, step=1, label="top_k")
    with gr.Row():
        rep_pen = gr.Slider(0.8, 2, 1.05, step=0.01, label="repetition_penalty")
        pres_pen= gr.Slider(0, 2, 0.0, step=0.05, label="presence_penalty")
    stop_txt = gr.Textbox("", label="stop sequences (comma-separated)")

    log_state = gr.State("")  # state passthrough
    dbg_chk = gr.Checkbox(label="📜 Show debug panel", value=False)  # ✅ off by default
    log_box = gr.Textbox(label="Live log", lines=20, interactive=False, visible=False)  # ✅ hidden by default

    # Chat interface (moved above the log)
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[max_new, temp, top_p, top_k,
                           rep_pen, pres_pen, stop_txt, log_state],
        additional_outputs=[log_state],
        type="messages"
    )

    # Log refresh timer
    timer = gr.Timer(1.0, render=True)
    timer.tick(
        fn=consume_logs,
        inputs=[],
        outputs=[log_box],
    )

    log_state.change(lambda txt: gr.update(value=txt), log_state, log_box)
    dbg_chk.change(lambda v: gr.update(visible=v), dbg_chk, log_box)


demo.launch(server_name="0.0.0.0", server_port=30001)
@@ -1,10 +0,0 @@
{
    "64": {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 64,
        "num_warps": 4,
        "num_stages": 3
    }
}
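The JSON above is one of the pre-tuned fused-MoE Triton kernel entries that Dockerfile.tmp copies into sglang's fused_moe_triton/configs directory, keyed by the token-batch dimension M ("64"). A rough sketch of how such a file can be consumed; the nearest-key lookup rule here is an assumption for illustration, not sglang's exact logic:

```python
import json

def load_moe_config(path: str, m: int) -> dict:
    """Pick the kernel config tuned for the batch size closest to m."""
    with open(path) as f:
        configs = {int(k): v for k, v in json.load(f).items()}
    return configs[min(configs, key=lambda k: abs(k - m))]

# e.g. load_moe_config("config.json", 64) -> {"BLOCK_SIZE_M": 16, ...}
```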
@@ -216,13 +216,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))


@app.get("/health")
async def health():
    """Check the health of the http server and return version info."""
    return {
        "status": "ok",
        "name": "sglang_0.4.8.post1",
        "version": "v1.0.0"  # put the version string you want to report here
    }
async def health() -> Response:
    """Check the health of the http server."""
    return Response(status_code=200)


@app.get("/health_generate")
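After this change /health returns an empty 200 instead of the version JSON, which still satisfies the Dockerfiles' curl-based HEALTHCHECK. An equivalent status-code-only probe in Python, for scripts that used to parse the old body:

```python
# Sketch: probe matching the new /health contract (status code only).
import requests

def is_healthy(base_url: str = "http://localhost:30000") -> bool:
    try:
        return requests.get(f"{base_url}/health", timeout=2).status_code == 200
    except requests.RequestException:
        return False
```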
@@ -868,22 +868,12 @@ def set_ulimit(target_soft_limit=65535):
def add_api_key_middleware(app, api_key: str):
    @app.middleware("http")
    async def authentication(request, call_next):
        # Let OPTIONS requests (CORS preflight) straight through
        if request.method == "OPTIONS":
            return await call_next(request)

        # Explicitly list the path prefixes that skip authentication
        whitelist_prefixes = (
            "/health",
            "/metrics",
            "/ping",
            "/get_model_info",
        )

        if any(request.url.path.startswith(prefix) for prefix in whitelist_prefixes):
        if request.url.path.startswith("/health"):
            return await call_next(request)
        if request.url.path.startswith("/metrics"):
            return await call_next(request)

        # Bearer token auth
        if request.headers.get("Authorization") != "Bearer " + api_key:
            return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)
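With the whitelist collapsed to /health and /metrics, previously open paths such as /ping and /get_model_info now require the Bearer token. A quick check sketch; the token matches the CMDs above, and the expected status codes assume the middleware exactly as shown:

```python
import requests

BASE = "http://localhost:30000"
auth = {"Authorization": "Bearer token-abc123"}

assert requests.get(f"{BASE}/health").status_code != 401           # still open
assert requests.get(f"{BASE}/get_model_info").status_code == 401   # now gated
assert requests.get(f"{BASE}/get_model_info", headers=auth).status_code != 401
```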
@@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/external/llm/ --lora-paths q3=/root/.cradle/external/lora/q3 --disable-radix-cache --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
@@ -1,23 +0,0 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=info

[program:sglang]
command=python3 -m sglang.launch_server --host 0.0.0.0 --port 30000 --model-path /root/.cradle/Alibaba/Qwen3-30B-A3B/ --tp 4 --api-key token-abc123 --enable-metrics
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:ui]
command=python3 /app/meta_ui.py --port 30001
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0