# syntax=docker/dockerfile:1
# Multi-stage image for vLLM (v0.10.0 tree) on CUDA, plus an OpenAI-compatible
# API server stage that runs vLLM and a Gradio UI under supervisord.

ARG CUDA_VERSION=12.1
ARG PYTHON_VERSION=3.10
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies.
# tzdata is preseeded so apt never blocks on an interactive timezone prompt.
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
        python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sSf ${GET_PIP_URL} | python3 \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# gcc-10/g++-10 as the default toolchain (needed for some CUDA extension builds)
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 \
    --slave /usr/bin/g++ g++ /usr/bin/g++-10

# NOTE(review): the original file contained a garbled `RUN <> /etc/environment`
# at this point, and a later `RUN --mount=type=bind,from=build,...` references a
# `build` stage that is not defined anywhere in this file. The wheel-building
# `build` stage appears to have been lost in the same corruption — restore it
# from the upstream vLLM Dockerfile before building this image. TODO: confirm.

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG GET_PIP_URL
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /vllm-workspace

# Record the dot-free Python version (e.g. "310") in /etc/environment so later
# `. /etc/environment` steps in this stage can read it.
# (Reconstructed: this line replaced the garbled `RUN <> /etc/environment`.)
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies (runtime image additionally needs
# wget/vim and the ffmpeg/X11/GL shared libraries for multimodal workloads).
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim \
        ffmpeg libsm6 libxext6 libgl1 \
        python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sSf ${GET_PIP_URL} | python3 \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Register the CUDA compat libraries with the dynamic linker.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# arm64 (e.g. GH200) needs nightly torch/torchvision/triton builds installed
# up front; on other platforms torch comes in as a dependency of the vLLM wheel.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system \
            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
            --pre pytorch_triton==3.3.0+gitab727c40 ; \
    fi

# Install vllm wheel first, so that torch etc will be installed.
# NOTE(review): requires the missing `build` stage (see note above).
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.8rc1"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
set -e  # heredoc lines are not &&-chained; fail fast on any error
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
    --branch ${FLASHINFER_GIT_REF} \
    ${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0a"
else
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
# Needed to build AOT kernels
pushd flashinfer
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer.aot
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        uv pip install --system --no-build-isolation .
popd
rm -rf flashinfer \
    ~/.cache/flashinfer/aot/*/unused*
BASH

COPY ./vllm_v0.10.0/examples examples
COPY ./vllm_v0.10.0/benchmarks benchmarks
COPY ./vllm_v0.10.0/vllm/collect_env.py .

# Log the final package set for build debugging.
RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    uv pip list

# Although we build flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation, so we need to install build dependencies.
COPY ./vllm_v0.10.0/requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE ####################

#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false

# ---- Add Tini as the container init process (reaps zombies, forwards signals)
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY ./vllm_v0.10.0/requirements/kv_connectors.txt requirements/kv_connectors.txt

# install additional dependencies for openai api server
# (kv_connectors.txt is read from the in-image path it was COPY'd to above;
# the old `./vllm_v0.10.0/...` build-context path does not exist in the image)
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        BITSANDBYTES_VERSION="0.42.0"; \
    else \
        BITSANDBYTES_VERSION="0.46.1"; \
    fi; \
    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] gradio==5.38.2

ENV VLLM_USAGE_SOURCE=production-docker-image

# ---- Install supervisord (runs the API server and the UI in one container) ----
RUN apt-get update \
    && apt-get install -y --no-install-recommends supervisor \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /etc/supervisor/conf.d

# ---- Copy UI and supervisor config ----
COPY ./meta_ui.py /app/meta_ui.py
COPY ./supervisord.conf /etc/supervisor/supervisord.conf
# COPY ./Alibaba/Qwen3-30B-A3B /root/.cradle/Alibaba/Qwen3-30B-A3B
COPY ./Alibaba/Qwen3-4B /root/.cradle/Alibaba/Qwen3-4B

FROM vllm-openai-base AS vllm-openai

# ---- Expose ports (API server / UI; documentation only) ----
EXPOSE 30000 30001

CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
#################### OPENAI API SERVER ####################