diff --git a/Dockerfile b/Dockerfile
index 868b817..e8e1bf5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,525 +1,441 @@
-
-# The vLLM Dockerfile is used to construct the vLLM image that can be directly
-# used to run the OpenAI compatible server.
-
-# Please update any changes made here to
-# docs/contributing/dockerfile/dockerfile.md and
-# docs/assets/contributing/dockerfile-stages-dependency.png
-
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
-
-# By parameterizing the base images, we allow third parties to use their own
-# base images. One use case is hermetic builds with base images stored in
-# private registries that use different repository naming conventions.
-#
-# Example:
-# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
-
-# By parameterizing the Deadsnakes repository URL, we allow third parties to
-# use their own mirror. When doing so, we don't benefit from the transparent
-# installation of the GPG key of the PPA, as done by add-apt-repository, so we
-# also need a URL for the GPG key.
-ARG DEADSNAKES_MIRROR_URL
-ARG DEADSNAKES_GPGKEY_URL
-
-# The PyPA get-pip.py script is a self-contained script+zip file that provides
-# both the installer script and the pip base85-encoded zip archive. This allows
-# bootstrapping pip in environments where a distribution package does not exist.
-#
-# By parameterizing the URL for the get-pip.py installation script, we allow
-# third parties to use their own copy of the script stored in a private mirror.
-# We set the default value to the PyPA-owned get-pip.py script.
-#
-# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
-ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
-
-# PIP supports fetching the packages from custom indexes, allowing third
-# parties to host the packages in private mirrors. The PIP_INDEX_URL and
-# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
-# default indexes. By leaving them empty by default, PIP will use its default
-# indexes if the build process doesn't override the indexes.
-#
-# Uv uses different variables. We set them by default to the same values as
-# PIP, but they can be overridden.
-ARG PIP_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL
-ARG UV_INDEX_URL=${PIP_INDEX_URL}
-ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-
-# PyTorch provides its own indexes for standard and nightly builds
-ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
-ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
-
-# PIP supports multiple authentication schemes, including keyring.
-# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
-# disabled by default, we allow third parties to use keyring authentication
-# for their private Python indexes, while not changing the default behavior,
-# which is no authentication.
-#
-# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
-ARG PIP_KEYRING_PROVIDER=disabled
-ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
-
-# Flag enables built-in KV-connector dependency libs into docker images
-ARG INSTALL_KV_CONNECTORS=false
-
-#################### BASE BUILD IMAGE ####################
-# prepare basic build environment
-FROM ${BUILD_BASE_IMAGE} AS base
-ARG CUDA_VERSION
-ARG PYTHON_VERSION
-ARG TARGETPLATFORM
-ARG INSTALL_KV_CONNECTORS=false
-ENV DEBIAN_FRONTEND=noninteractive
-
-ARG DEADSNAKES_MIRROR_URL
-ARG DEADSNAKES_GPGKEY_URL
-ARG GET_PIP_URL
-
-# Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo \
-    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
-        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
-            mkdir -p -m 0755 /etc/apt/keyrings ; \
-            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
-            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
-            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
-        fi ; \
-    else \
-        for i in 1 2 3; do \
-            add-apt-repository -y ppa:deadsnakes/ppa && break || \
-            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-        done ; \
-    fi \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
-
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
-# as it was causing spam when compiling the CUTLASS kernels
-RUN apt-get install -y gcc-10 g++-10
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
-RUN <> /etc/environment
-
-# Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
-        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
-            mkdir -p -m 0755 /etc/apt/keyrings ; \
-            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
-            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
-            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
-        fi ; \
-    else \
-        for i in 1 2 3; do \
-            add-apt-repository -y ppa:deadsnakes/ppa && break || \
-            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-        done ; \
-    fi \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
-
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-
-# The arm64 (GH200) build follows the "use existing pytorch" practice:
-# we need to install torch and torchvision from the nightly builds first;
-# pytorch will not appear as a vLLM dependency in any of the following steps
-# after this step
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system \
-            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
-        uv pip install --system \
-            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-            --pre pytorch_triton==3.3.0+gitab727c40 ; \
-    fi
-
-# Install vllm wheel first, so that torch etc will be installed.
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-
-# If we need to build the FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-ARG FLASHINFER_GIT_REF="v0.2.8rc1"
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-    # Needed to build AOT kernels
-    pushd flashinfer
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation .
-    popd
-    rm -rf flashinfer
-BASH
-COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    . /etc/environment && \
-    uv pip list
-
-# Even when we build FlashInfer with AOT mode, there are still
-# some issues w.r.t. JIT compilation. Therefore we need to
-# install build dependencies for JIT compilation.
-# TODO: Remove this once the FlashInfer AOT wheel is fixed
-COPY requirements/build.txt requirements/build.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-
-#################### vLLM installation IMAGE ####################
-
-#################### TEST IMAGE ####################
-# image to run unit testing suite
-# note that this uses vllm installed by `pip`
-FROM vllm-base AS test
-
-ADD . /vllm-workspace/
-
-ARG PYTHON_VERSION
-
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
-    if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt; \
-    fi
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -e tests/vllm_test_utils
-
-# enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
-
-# Copy in the v1 package for testing (it isn't distributed yet)
-COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
-
-# doc requires source code
-# we hide them inside `test_docs/`, so that this source code
-# will not be imported by other tests
-RUN mkdir test_docs
-RUN mv docs test_docs/
-RUN cp -r examples test_docs/
-RUN mv vllm test_docs/
-RUN mv mkdocs.yaml test_docs/
-#################### TEST IMAGE ####################
-
-#################### OPENAI API SERVER ####################
-# base openai image with additional requirements, for any subsequent openai-style images
-FROM vllm-base AS vllm-openai-base
-ARG TARGETPLATFORM
-ARG INSTALL_KV_CONNECTORS=false
-
-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-
-COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
-
-# install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        uv pip install --system -r requirements/kv_connectors.txt; \
-    fi; \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
-    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
-    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
-
-ENV VLLM_USAGE_SOURCE production-docker-image
-
-# define sagemaker first, so it is not default from `docker build`
-FROM vllm-openai-base AS vllm-sagemaker
-
-COPY examples/online_serving/sagemaker-entrypoint.sh .
-RUN chmod +x sagemaker-entrypoint.sh
-ENTRYPOINT ["./sagemaker-entrypoint.sh"]
+# ARG CUDA_VERSION=12.8.1
+# ARG PYTHON_VERSION=3.12
+
+
+# ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+
+# ARG DEADSNAKES_MIRROR_URL
+# ARG DEADSNAKES_GPGKEY_URL
+
+# ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
+
+# ARG PIP_INDEX_URL
+# ARG PIP_EXTRA_INDEX_URL
+# ARG UV_INDEX_URL=${PIP_INDEX_URL}
+# ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+
+# # PyTorch provides its own indexes for standard and nightly builds
+# ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
+# ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
+
+# ARG PIP_KEYRING_PROVIDER=disabled
+# ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
+
+# # Flag enables built-in KV-connector dependency libs into docker images
+# ARG INSTALL_KV_CONNECTORS=false
+
+# #################### BASE BUILD IMAGE ####################
+# # prepare basic build environment
+# FROM ${BUILD_BASE_IMAGE} AS base
+# ARG CUDA_VERSION
+# ARG PYTHON_VERSION
+# ARG TARGETPLATFORM
+# ARG INSTALL_KV_CONNECTORS=false
+# ENV DEBIAN_FRONTEND=noninteractive
+
+# ARG DEADSNAKES_MIRROR_URL
+# ARG DEADSNAKES_GPGKEY_URL
+# ARG GET_PIP_URL
+
+# # Install Python and other dependencies
+# RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+#     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+#     && apt-get update -y \
+#     && apt-get install -y ccache software-properties-common git curl sudo \
+#     && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
+#         if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
+#             mkdir -p -m 0755 /etc/apt/keyrings ; \
+#             curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
+#             sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
+#             echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
+#         fi ; \
+#     else \
+#         for i in 1 2 3; do \
+#             add-apt-repository -y ppa:deadsnakes/ppa && break || \
+#             { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+#         done ; \
+#     fi \
+#     && apt-get update -y \
+#     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+#     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+#     && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+#     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+#     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
+#     && python3 --version && python3 -m pip --version
+
+# ARG PIP_INDEX_URL UV_INDEX_URL
+# ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+# ARG PYTORCH_CUDA_INDEX_BASE_URL
+# ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
+# ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
+# # Install uv for faster pip installs
+# RUN --mount=type=cache,target=/root/.cache/uv \
+#     python3 -m pip install uv
+
+# ENV UV_HTTP_TIMEOUT=500
+# ENV UV_INDEX_STRATEGY="unsafe-best-match"
+
+# RUN apt-get install -y gcc-10 g++-10
+# RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
+# RUN <> /etc/environment
+
+# # Install Python and other dependencies
+# RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+#     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+#     && apt-get update -y \
+#     && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+#     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+#     && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
+#         if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
+#             mkdir -p -m 0755 /etc/apt/keyrings ; \
+#             curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
+#             sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
+#             echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
+#         fi ; \
+#     else \
+#         for i in 1 2 3; do \
+#             add-apt-repository -y ppa:deadsnakes/ppa && break || \
+#             { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+#         done ; \
+#     fi \
+#     && apt-get update -y \
+#     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
+#     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+#     && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+#     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+#     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
+#     && python3 --version && python3 -m pip --version
+
+# ARG PIP_INDEX_URL UV_INDEX_URL
+# ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+# ARG PYTORCH_CUDA_INDEX_BASE_URL
+# ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
+# ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
+# # Install uv for faster pip installs
+# RUN --mount=type=cache,target=/root/.cache/uv \
+#     python3 -m pip install uv
+
+# # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# # Reference: https://github.com/astral-sh/uv/pull/1694
+# ENV UV_HTTP_TIMEOUT=500
+# ENV UV_INDEX_STRATEGY="unsafe-best-match"
+
+# RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+
+# RUN --mount=type=cache,target=/root/.cache/uv \
+#     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+#         uv pip install --system \
+#             --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+#             "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
+#         uv pip install --system \
+#             --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+#             --pre pytorch_triton==3.3.0+gitab727c40 ; \
+#     fi
+
+# # Install vllm wheel first, so that torch etc will be installed.
+# RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+#     --mount=type=cache,target=/root/.cache/uv \
+#     uv pip install --system dist/*.whl --verbose \
+#         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+# # Install FlashInfer from source
+# ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+# ARG FLASHINFER_GIT_REF="v0.2.8rc1"
+# RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+#     . /etc/environment
+#     git clone --depth 1 --recursive --shallow-submodules \
+#         --branch ${FLASHINFER_GIT_REF} \
+#         ${FLASHINFER_GIT_REPO} flashinfer
+#     # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+#     # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+# if [[ "${CUDA_VERSION}" == 11.* ]]; then +# FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" +# elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then +# FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" +# else +# # CUDA 12.8+ supports 10.0a and 12.0 +# FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" +# fi +# echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" +# # Needed to build AOT kernels +# pushd flashinfer +# TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ +# python3 -m flashinfer.aot +# TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ +# uv pip install --system --no-build-isolation . +# popd +# rm -rf flashinfer +# BASH +# COPY examples examples +# COPY benchmarks benchmarks +# COPY ./vllm_v0.10.0/vllm/collect_env.py . + +# RUN --mount=type=cache,target=/root/.cache/uv \ +# . /etc/environment && \ +# uv pip list + + +# COPY ./vllm_v0.10.0/requirements/build.txt requirements/build.txt +# RUN --mount=type=cache,target=/root/.cache/uv \ +# uv pip install --system -r ./vllm_v0.10.0/requirements/build.txt \ +# --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + +# #################### vLLM installation IMAGE #################### + +# #################### TEST IMAGE #################### +# # image to run unit testing suite +# # note that this uses vllm installed by `pip` +# FROM vllm-base AS test + +# ADD ./vllm_v0.10.0/. /vllm-workspace/ + +# ARG PYTHON_VERSION + +# ARG PIP_INDEX_URL UV_INDEX_URL +# ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL + +# # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# # Reference: https://github.com/astral-sh/uv/pull/1694 +# ENV UV_HTTP_TIMEOUT=500 +# ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# # Workaround for #17068 +# RUN --mount=type=cache,target=/root/.cache/uv \ +# uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" + +# # install development dependencies (for testing) +# RUN --mount=type=cache,target=/root/.cache/uv \ +# CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ +# if [ "$CUDA_MAJOR" -ge 12 ]; then \ +# uv pip install --system -r requirements/dev.txt; \ +# fi + +# # install development dependencies (for testing) +# RUN --mount=type=cache,target=/root/.cache/uv \ +# uv pip install --system -e tests/vllm_test_utils + +# # enable fast downloads from hf (for testing) +# RUN --mount=type=cache,target=/root/.cache/uv \ +# uv pip install --system hf_transfer +# ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# # Copy in the v1 package for testing (it isn't distributed yet) +# COPY ./vllm_v0.10.0/vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 + +# # doc requires source code +# # we hide them inside `test_docs/` , so that this source code +# # will not be imported by other tests +# RUN mkdir test_docs +# RUN mv docs test_docs/ +# RUN cp -r ./vllm_v0.10.0/examples test_docs/ +# RUN mv vllm test_docs/ +# RUN mv mkdocs.yaml test_docs/ +# #################### TEST IMAGE #################### + +# #################### OPENAI API SERVER #################### +# # base openai image with additional requirements, for any subsequent openai-style images +# FROM vllm-base AS vllm-openai-base +# ARG TARGETPLATFORM +# ARG INSTALL_KV_CONNECTORS=false + +# ARG PIP_INDEX_URL UV_INDEX_URL +# ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL + +# # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# # Reference: https://github.com/astral-sh/uv/pull/1694 +# 
ENV UV_HTTP_TIMEOUT=500 + +# COPY ./vllm_v0.10.0/requirements/kv_connectors.txt requirements/kv_connectors.txt + +# # install additional dependencies for openai api server +# RUN --mount=type=cache,target=/root/.cache/uv \ +# if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ +# uv pip install --system -r requirements/kv_connectors.txt; \ +# fi; \ +# if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ +# BITSANDBYTES_VERSION="0.42.0"; \ +# else \ +# BITSANDBYTES_VERSION="0.46.1"; \ +# fi; \ +# uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + +# ENV VLLM_USAGE_SOURCE production-docker-image + +# # define sagemaker first, so it is not default from `docker build` +# FROM vllm-openai-base AS vllm-sagemaker + +# COPY ./vllm_v0.10.0/examples/online_serving/sagemaker-entrypoint.sh . +# RUN chmod +x sagemaker-entrypoint.sh +# ENTRYPOINT ["./sagemaker-entrypoint.sh"] FROM vllm-openai-base AS vllm-openai
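
Usage note: with the stage layout above, a specific image variant is selected at build time with `--target`, and the parameterized ARGs are overridden with `--build-arg`. A minimal sketch, assuming it runs from a checkout containing this Dockerfile (the image tags `vllm-openai:dev` and `vllm-sagemaker:dev` are arbitrary examples, not defined by the diff):

    # Build the OpenAI-compatible server image (the last stage, vllm-openai)
    docker build . \
        --target vllm-openai \
        --build-arg CUDA_VERSION=12.8.1 \
        --build-arg PYTHON_VERSION=3.12 \
        --build-arg INSTALL_KV_CONNECTORS=true \
        -t vllm-openai:dev

    # The SageMaker variant is selected the same way
    docker build . --target vllm-sagemaker -t vllm-sagemaker:dev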
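For the hermetic-build use case the BUILD_BASE_IMAGE, DEADSNAKES_*, GET_PIP_URL, and PIP_INDEX_URL comments describe, every external endpoint can be redirected the same way. A sketch with placeholder hosts: registry.acme.org comes from the Dockerfile's own example, while mirror.acme.org and the URL paths are purely illustrative:

    docker build . \
        --target vllm-openai \
        --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:12.8.1-devel-ubuntu20.04 \
        --build-arg FINAL_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:12.8.1-devel-ubuntu22.04 \
        --build-arg DEADSNAKES_MIRROR_URL=https://mirror.acme.org/deadsnakes \
        --build-arg DEADSNAKES_GPGKEY_URL=https://mirror.acme.org/deadsnakes/gpg.key \
        --build-arg GET_PIP_URL=https://mirror.acme.org/get-pip.py \
        --build-arg PIP_INDEX_URL=https://mirror.acme.org/pypi/simple \
        -t vllm-openai:hermetic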